// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2004-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  xmlparser.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004jul21
*   created by: Andy Heninger
*/

#include <stdio.h>
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/regex.h"
#include "filestrm.h"
#include "xmlparser.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION

// character constants
// Code point values of the ASCII characters the parser refers to by name.
enum {
    x_QUOT=0x22,    // "
    x_AMP=0x26,     // &
    x_APOS=0x27,    // '
    x_LT=0x3c,      // <
    x_GT=0x3e,      // >
    x_l=0x6c        // l  (parseFile() searches for it to step past "<?xml")
};

// Regex character class matching one XML white space character:
// space, tab (U+0009), carriage return (U+000D) or line feed (U+000A).
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
// Regex character class for a character that may start an XML name.
#define XML_NAMESTARTCHAR  "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML #5
47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// XML #6 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// UXMLParser constructor. Mostly just initializes the ICU regexes that are 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// used for parsing. 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::UXMLParser(UErrorCode &status) : 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XML Declaration. XML Production #23. 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // example: "<?xml version=1.0 encoding="utf-16" ?> 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This is a sloppy implementation - just look for the leading <?xml and the closing ?> 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // allow for a possible leading BOM. 
66c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XML Comment production #15 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // example: "<!-- whatever --> 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // note, does not detect an illegal "--" within comments 71c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XML Spaces 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // production [3] 75c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XML Doctype decl production #28 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // example "<!DOCTYPE foo SYSTEM "somewhere" > 79c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // or "<!DOCTYPE foo [internal dtd]> 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: we don't actually parse the DOCTYPE or internal subsets. 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Some internal dtd subsets could confuse this simple-minded 82c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // attempt at skipping over them, specifically, occcurences 83c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // of closeing square brackets. These could appear in comments, 84c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // or in parameter entity declarations, for example. 
85c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mXMLDoctype(UnicodeString( 86c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV 87c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ), 0, status), 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XML PI production #16 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // example "<?target stuff?> 91c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XML Element Start Productions #40, #41 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // example <foo att1='abc' att2="d e f" > 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // capture #1: the tag name 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "(?:" 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ")*" // * for zero or more attributes. 
102c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XML Element End production #42 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // example </foo> 106c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XML Element Empty production #44 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // example <foo att1="abc" att2="d e f" /> 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "(?:" 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ")*" // * for zero or more attributes. 115c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XMLCharData. Everything but '<'. Note that & will be dealt with later. 
119c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Attribute name = "value". XML Productions 10, 40/41 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Capture group 1 is name, 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2 is the attribute value, including the quotes. 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note that attributes are scanned twice. The first time is with 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the regex for an entire element start. There, the attributes 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are checked syntactically, but not separted out one by one. 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Here, we match a single attribute, and make its name and 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // attribute value available to the parser code. 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" 131c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Match any of the new-line sequences in content. 
137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // All are changed to \u000a. 138c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // & char references 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We will figure out what we've got based on which capture group has content. 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The last one is a catchall for unrecognized entity references.. 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1 2 3 4 5 6 7 8 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, status), 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNames(status), 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fElementStack(status), 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. 
150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser * 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::createParser(UErrorCode &errorCode) { 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(errorCode)) { 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return new UXMLParser(errorCode); 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::~UXMLParser() {} 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement * 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char bytes[4096], charsetBuffer[100]; 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru FileStream *f; 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *charset, *pb; 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString src; 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UConverter *cnv; 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *buffer, *pu; 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fileLength, bytesLength, length, capacity; 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool flush; 
174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru f=T_FileStream_open(filename, "rb"); 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(f==NULL) { 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorCode=U_FILE_ACCESS_ERROR; 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(bytesLength<(int32_t)sizeof(bytes)) { 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we have already read the entire file 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileLength=bytesLength; 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // get the file length 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileLength=T_FileStream_size(f); 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * get the charset: 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 1. Unicode signature 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 2. 
treat as ISO-8859-1 and read XML encoding="charser" 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 3. default to UTF-8 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_SUCCESS(errorCode) && charset!=NULL) { 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // open converter according to Unicode signature 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cnv=ucnv_open(charset, &errorCode); 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // read as Latin-1 and parse the XML declaration and encoding 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cnv=ucnv_open("ISO-8859-1", &errorCode); 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // unexpected error opening Latin-1 converter 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto exit; 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2120596faeddefbf198de137d5e893708495ab1584cFredrik Roubert buffer=toUCharPtr(src.getBuffer(bytesLength)); 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(buffer==NULL) { 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // unexpected failure to reserve some string capacity 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorCode=U_MEMORY_ALLOCATION_ERROR; 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto exit; 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pb=bytes; 
219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pu=buffer; 220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_toUnicode( 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cnv, 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &pu, buffer+src.getCapacity(), 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &pb, bytes+bytesLength, 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru NULL, TRUE, &errorCode); 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_close(cnv); 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cnv=NULL; 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // unexpected error in conversion from Latin-1 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru src.remove(); 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto exit; 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parse XML declaration 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t declEnd=mXMLDecl.end(errorCode); 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // go beyond <?xml 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t pos=src.indexOf((UChar)x_l)+1; 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAttrValue.reset(src); 241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while(pos<declEnd && 
mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString attName = mAttrValue.group(1, errorCode); 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString attValue = mAttrValue.group(2, errorCode); 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Trim the quotes from the att value. These are left over from the original regex 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // that parsed the attribue, which couldn't conveniently strip them. 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru attValue.remove(0,1); // one char from the beginning 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru attValue.truncate(attValue.length()-1); // and one from the end. 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(attName==UNICODE_STRING("encoding", 8)) { 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charset=charsetBuffer; 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = mAttrValue.end(2, errorCode); 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(charset==NULL) { 259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // default to UTF-8 260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charset="UTF-8"; 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cnv=ucnv_open(charset, &errorCode); 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // unable to open the converter 268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto exit; 269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // convert the file contents 272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru capacity=fileLength; // estimated capacity 273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru src.getBuffer(capacity); 274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru src.releaseBuffer(0); // zero length 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru flush=FALSE; 276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(;;) { 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // convert contents of bytes[bytesLength] 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pb=bytes; 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(;;) { 280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru length=src.length(); 2810596faeddefbf198de137d5e893708495ab1584cFredrik Roubert buffer=toUCharPtr(src.getBuffer(capacity)); 282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(buffer==NULL) { 283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // unexpected failure to reserve some string capacity 284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorCode=U_MEMORY_ALLOCATION_ERROR; 
285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto exit; 286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pu=buffer+length; 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_toUnicode( 290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cnv, &pu, buffer+src.getCapacity(), 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &pb, bytes+bytesLength, 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru NULL, FALSE, &errorCode); 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); 294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorCode=U_ZERO_ERROR; 296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru capacity=(3*src.getCapacity())/2; // increase capacity by 50% 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; // conversion error 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(flush) { 307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; // completely converted the file 308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // read next block 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); 312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(bytesLength==0) { 313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // reached end of file, convert once more to flush the converter 314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru flush=TRUE; 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruexit: 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_close(cnv); 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru T_FileStream_close(f); 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_SUCCESS(errorCode)) { 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return parse(src, errorCode); 324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement * 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::parse(const UnicodeString &src, UErrorCode &status) { 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste 
Queru } 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UXMLElement *root = NULL; 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = 0; // TODO use just a local pos variable and pass it into functions 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // where necessary? 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // set all matchers to work on the input string 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLDecl.reset(src); 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLComment.reset(src); 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLSP.reset(src); 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLDoctype.reset(src); 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLPI.reset(src); 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLElemStart.reset(src); 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLElemEnd.reset(src); 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLElemEmpty.reset(src); 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLCharData.reset(src); 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAttrValue.reset(src); 350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAttrNormalizer.reset(src); 351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mNewLineNormalizer.reset(src); 352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAmps.reset(src); 353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Consume the XML Declaration, if present. 
355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLDecl.lookingAt(fPos, status)) { 356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLDecl.end(status); 357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Consume "misc" [XML production 27] appearing before DocType 360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseMisc(status); 361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Consume a DocType declaration, if present. 363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLDoctype.lookingAt(fPos, status)) { 364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLDoctype.end(status); 365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Consume additional "misc" [XML production 27] appearing after the DocType 368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseMisc(status); 369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Get the root element 371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLElemEmpty.lookingAt(fPos, status)) { 372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Root is an empty element (no nested elements or content) 373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru root = createElement(mXMLElemEmpty, status); 374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLElemEmpty.end(status); 375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if 
(mXMLElemStart.lookingAt(fPos, status) == FALSE) { 377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error("Root Element expected", status); 378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto errorExit; 379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru root = createElement(mXMLElemStart, status); 381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UXMLElement *el = root; 382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This is the loop that consumes the root element of the document, 385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // including all nested content. Nested elements are handled by 386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // explicit pushes/pops of the element stack; there is no recursion 387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // in the control flow of this code. 388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // "el" always refers to the current element, the one to which content 389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is being added. It is above the top of the element stack. 
390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Nested Element Start 392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLElemStart.lookingAt(fPos, status)) { 393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UXMLElement *t = createElement(mXMLElemStart, status); 394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el->fChildren.addElement(t, status); 395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru t->fParent = el; 396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fElementStack.push(el, status); 397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el = t; 398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Text Content. String is concatenated onto the current node's content, 402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // but only if it contains something other than spaces. 403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString s = scanContent(status); 404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (s.length() > 0) { 405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLSP.reset(s); 406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLSP.matches(status) == FALSE) { 407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This chunk of text contains something other than just 408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // white space. Make a child node for it. 
409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru replaceCharRefs(s, status); 410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el->fChildren.addElement(s.clone(), status); 411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mXMLSP.reset(src); // The matchers need to stay set to the main input string. 413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Comments. Discard. 417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLComment.lookingAt(fPos, status)) { 418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLComment.end(status); 419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // PIs. Discard. 
423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLPI.lookingAt(fPos, status)) { 424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLPI.end(status); 425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Element End 429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLElemEnd.lookingAt(fPos, status)) { 430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLElemEnd.end(0, status); 431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString name = mXMLElemEnd.group(1, status); 432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (name != *el->fName) { 433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error("Element start / end tag mismatch", status); 434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto errorExit; 435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fElementStack.empty()) { 437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Close of the root element. We're done with the doc. 438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el = NULL; 439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el = (UXMLElement *)fElementStack.pop(); 442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Empty Element. 
Stored as a child of the current element, but not stacked. 446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLElemEmpty.lookingAt(fPos, status)) { 447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UXMLElement *t = createElement(mXMLElemEmpty, status); 448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el->fChildren.addElement(t, status); 449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Hit something within the document that doesn't match anything. 453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // It's an error. 454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error("Unrecognized markup", status); 455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (el != NULL || !fElementStack.empty()) { 459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We bailed out early, for some reason. 460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error("Root element not closed.", status); 461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto errorExit; 462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Root Element parse is complete. 466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Consume the annoying xml "Misc" that can appear at the end of the doc. 
467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseMisc(status); 468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We should have reached the end of the input 470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fPos != src.length()) { 471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error("Extra content at the end of the document", status); 472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto errorExit; 473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Success! 476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return root; 477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruerrorExit: 479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete root; 480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// createElement 485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// We've just matched an element start tag. Create and fill in a UXMLElement object 486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// for it. 487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement * 489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { 490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // First capture group is the element's name. 
491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); 492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scan for attributes. 494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t pos = mEl.end(1, status); // The position after the end of the tag name 495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. 497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString attName = mAttrValue.group(1, status); 498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString attValue = mAttrValue.group(2, status); 499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Trim the quotes from the att value. These are left over from the original regex 501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // that parsed the attribue, which couldn't conveniently strip them. 502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru attValue.remove(0,1); // one char from the beginning 503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru attValue.truncate(attValue.length()-1); // and one from the end. 504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // XML Attribue value normalization. 506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This is one of the really screwy parts of the XML spec. 
507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize 508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note that non-validating parsers must treat all entities as type CDATA 509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // which simplifies things some. 510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Att normalization step 1: normalize any newlines in the attribute value 512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mNewLineNormalizer.reset(attValue); 513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru attValue = mNewLineNormalizer.replaceAll(fOneLF, status); 514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Next change all xml white space chars to plain \u0020 spaces. 516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAttrNormalizer.reset(attValue); 517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString oneSpace((UChar)0x0020); 518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru attValue = mAttrNormalizer.replaceAll(oneSpace, status); 519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Replace character entities. 521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru replaceCharRefs(attValue, status); 522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Save the attribute name and value in our document structure. 
524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el->fAttNames.addElement((void *)intern(attName, status), status); 525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el->fAttValues.addElement(attValue.clone(), status); 526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = mAttrValue.end(2, status); 527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mEl.end(0, status); 529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return el; 530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// parseMisc 534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Consume XML "Misc" [production #27] 535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// which is any combination of space, PI and comments 536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Need to watch end-of-input because xml MISC stuff is allowed after 537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// the document element, so we WILL scan off the end in this function 538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid 540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::parseMisc(UErrorCode &status) { 541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fPos >= mXMLPI.input().length()) { 543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLPI.lookingAt(fPos, status)) { 
546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLPI.end(status); 547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLSP.lookingAt(fPos, status)) { 550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLSP.end(status); 551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLComment.lookingAt(fPos, status)) { 554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLComment.end(status); 555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Scan for document content. 563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeString 565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::scanContent(UErrorCode &status) { 566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString result; 567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mXMLCharData.lookingAt(fPos, status)) { 56850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho result = mXMLCharData.group((int32_t)0, status); 569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Normalize the new-lines. 
(Before char ref substitution) 570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mNewLineNormalizer.reset(result); 571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = mNewLineNormalizer.replaceAll(fOneLF, status); 572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: handle CDATA 574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPos = mXMLCharData.end(0, status); 575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// replaceCharRefs 582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// replace the char entities < & { ካ etc. in a string 584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// with the corresponding actual character. 
585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid 587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { 588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString result; 589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString replacement; 590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAmps.reset(s); 593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See the initialization for the regex matcher mAmps. 594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Which entity we've matched is determined by which capture group has content, 595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // which is flaged by start() of that group not being -1. 
596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (mAmps.find()) { 597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (mAmps.start(1, status) != -1) { 598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru replacement.setTo((UChar)x_AMP); 599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (mAmps.start(2, status) != -1) { 600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru replacement.setTo((UChar)x_LT); 601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (mAmps.start(3, status) != -1) { 602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru replacement.setTo((UChar)x_GT); 603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (mAmps.start(4, status) != -1) { 604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru replacement.setTo((UChar)x_APOS); 605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (mAmps.start(5, status) != -1) { 606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru replacement.setTo((UChar)x_QUOT); 607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (mAmps.start(6, status) != -1) { 608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString hexString = mAmps.group(6, status); 609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 val = 0; 610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=0; i<hexString.length(); i++) { 611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru val = (val << 4) + u_digit(hexString.charAt(i), 16); 612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: some verification that the character is valid 614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru replacement.setTo(val); 615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (mAmps.start(7, status) != -1) { 
616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString decimalString = mAmps.group(7, status); 617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 val = 0; 618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=0; i<decimalString.length(); i++) { 619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru val = val*10 + u_digit(decimalString.charAt(i), 10); 620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: some verification that the character is valid 622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru replacement.setTo(val); 623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // An unrecognized &entity; Leave it alone. 625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: check that it really looks like an entity, and is not some 626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // random & in the text. 
62750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho replacement = mAmps.group((int32_t)0, status); 628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAmps.appendReplacement(result, replacement, status); 630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mAmps.appendTail(result); 632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru s = result; 633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid 636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::error(const char *message, UErrorCode &status) { 637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: something better here... 638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString &src=mXMLDecl.input(); 639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int line = 0; 640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int ci = 0; 641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ci < fPos && ci>=0) { 642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ci = src.indexOf((UChar)0x0a, ci+1); 643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru line++; 644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "Error: %s at line %d\n", message, line); 646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status)) { 647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_PARSE_ERROR; 648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// intern strings like in Java 652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst UnicodeString * 654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { 655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *he=fNames.find(s); 656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(he!=NULL) { 657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // already a known name, return its hashed key pointer 658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return (const UnicodeString *)he->key.pointer; 659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // add this new name and return its hashed key pointer 661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNames.puti(s, 0, errorCode); 662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru he=fNames.find(s); 663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return (const UnicodeString *)he->key.pointer; 664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst UnicodeString * 668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLParser::findName(const UnicodeString &s) const { 669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *he=fNames.find(s); 670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(he!=NULL) { 671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // a known name, return its hashed key pointer 672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
return (const UnicodeString *)he->key.pointer; 673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // unknown name 675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// UXMLElement ------------------------------------------------------------- *** 680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : 682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParser(parser), 683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fName(name), 684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fAttNames(errorCode), 685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fAttValues(errorCode), 686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fChildren(errorCode), 687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParent(NULL) 688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::~UXMLElement() { 692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // attribute names are owned by the UXMLParser, don't delete them here 694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=fAttValues.size()-1; i>=0; i--) { 695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete (UObject 
*)fAttValues.elementAt(i); 696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=fChildren.size()-1; i>=0; i--) { 698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete (UObject *)fChildren.elementAt(i); 699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst UnicodeString & 703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::getTagName() const { 704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return *fName; 705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeString 708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::getText(UBool recurse) const { 709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString text; 710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru appendText(text, recurse); 711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return text; 712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid 715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::appendText(UnicodeString &text, UBool recurse) const { 716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UObject *node; 717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i, count=fChildren.size(); 718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(i=0; i<count; ++i) { 719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru node=(const UObject 
*)fChildren.elementAt(i); 72027f654740f2a26ad62a5c155af9199af9e69b889claireho const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); 72127f654740f2a26ad62a5c155af9199af9e69b889claireho if(s!=NULL) { 72227f654740f2a26ad62a5c155af9199af9e69b889claireho text.append(*s); 723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if(recurse) /* must be a UXMLElement */ { 724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ((const UXMLElement *)node)->appendText(text, recurse); 725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t 730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::countAttributes() const { 731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fAttNames.size(); 732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst UnicodeString * 735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { 736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(0<=i && i<fAttNames.size()) { 737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); 738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); 739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return &value; // or return (UnicodeString *)fAttValues.elementAt(i); 740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
return NULL; 742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst UnicodeString * 746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::getAttribute(const UnicodeString &name) const { 747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // search for the attribute name by comparing the interned pointer, 748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // not the string contents 749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString *p=fParser->findName(name); 750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(p==NULL) { 751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; // no such attribute seen by the parser at all 752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i, count=fAttNames.size(); 755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(i=0; i<count; ++i) { 756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(p==(const UnicodeString *)fAttNames.elementAt(i)) { 757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return (const UnicodeString *)fAttValues.elementAt(i); 758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t 764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::countChildren() const { 
765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fChildren.size(); 766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst UObject * 769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::getChild(int32_t i, UXMLNodeType &type) const { 770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(0<=i && i<fChildren.size()) { 771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UObject *node=(const UObject *)fChildren.elementAt(i); 77227f654740f2a26ad62a5c155af9199af9e69b889claireho if(dynamic_cast<const UXMLElement *>(node)!=NULL) { 773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru type=UXML_NODE_TYPE_ELEMENT; 774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru type=UXML_NODE_TYPE_STRING; 776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return node; 778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst UXMLElement * 784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::nextChildElement(int32_t &i) const { 785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(i<0) { 786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UObject *node; 790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count=fChildren.size(); 791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while(i<count) { 792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru node=(const UObject *)fChildren.elementAt(i++); 79327f654740f2a26ad62a5c155af9199af9e69b889claireho const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); 79427f654740f2a26ad62a5c155af9199af9e69b889claireho if(elem!=NULL) { 79527f654740f2a26ad62a5c155af9199af9e69b889claireho return elem; 796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst UXMLElement * 802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUXMLElement::getChildElement(const UnicodeString &name) const { 803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // search for the element name by comparing the interned pointer, 804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // not the string contents 805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString *p=fParser->findName(name); 806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(p==NULL) { 807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; // no such element seen by the parser at all 808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UObject *node; 811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i, count=fChildren.size(); 
812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(i=0; i<count; ++i) { 813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru node=(const UObject *)fChildren.elementAt(i); 81427f654740f2a26ad62a5c155af9199af9e69b889claireho const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); 81527f654740f2a26ad62a5c155af9199af9e69b889claireho if(elem!=NULL) { 816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(p==elem->fName) { 817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return elem; 818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END 825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 828