"""A parser for XML, using the derived class as static DTD."""

# Author: Sjoerd Mullender.

import re
import string

import warnings
warnings.warn("The xmllib module is obsolete. Use xml.sax instead.",
              DeprecationWarning, 2)
del warnings

version = '0.3'


class Error(RuntimeError):
    """Exception type raised by this module."""
    pass


# Regular expressions used for parsing
#
# Backslashes that are meant for the regex engine (as opposed to Python
# string escapes such as \t or \n) are either doubled or written in raw
# strings, so the literals contain no invalid escape sequences.

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]')  # illegal chars in content
interesting = re.compile('[]&<]')

amp = re.compile('&')
# The three reference patterns below also consume the single character that
# follows the reference; callers inspect it to see whether it is a ';'.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>' + _QStr + '|[-a-zA-Z0-9.:+*%?!\\(\\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch = re.compile('<(?P<tagname>' + _Name + ')'
                           '(?P<attrs>(?:' + attrfind.pattern + ')*)' +
                           starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
endbracketfind = re.compile('(?:[^>\'"]|' + _QStr + ')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
_SystemLiteral = '(?P<%s>' + _QStr + ')'
_PublicLiteral = '(?P<%s>"[-\'\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                 "'[-\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
              'PUBLIC' + _S + _PublicLiteral % 'pubid' + \
              ')' + _S + _SystemLiteral % 'syslit'
doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')'
                     '(?:' + _S + _ExternalId + ')?' + _opS)
xmldecl = re.compile(r'<\?xml' + _S +
                     'version' + _opS + '=' + _opS +
                     '(?P<version>' + _QStr + ')' +
                     '(?:' + _S + 'encoding' + _opS + '=' + _opS +
                     "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                     '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:' + _S + 'standalone' + _opS + '=' + _opS +
                     '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?' +
                     _opS + r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')

# Translation table mapping each of space, CR, LF and TAB to a plain space.
# Both arguments must have the same length (four characters each).
# string.maketrans only exists on Python 2; str.maketrans is the Python 3
# spelling and yields a table usable with str.translate().
try:
    _maketrans = string.maketrans
except AttributeError:
    _maketrans = str.maketrans
attrtrans = _maketrans(' \r\n\t', '    ')
del _maketrans

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?'  # optional prefix
                   '(?P<local>' + _NCName + ')$')

xmlns = re.compile('xmlns(?::(?P<ncname>' + _NCName + '))?$')

# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively. The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).

class XMLParser:
    attributes = {}                     # default, to be overridden
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    __accept_unquoted_attributes = 0
    __accept_missing_endtag_name = 0
    __map_case = 0
    __accept_utf8 = 0
    __translate_attribute_references = 1

    # Interface -- initialize and reset this instance
    def __init__(self, **kw):
        """Store the recognised parsing options from *kw*, then reset().

        Recognised keywords: accept_unquoted_attributes,
        accept_missing_endtag_name, map_case, accept_utf8 and
        translate_attribute_references.  Any other keyword is ignored.
        """
        self.__fixed = 0
        if 'accept_unquoted_attributes' in kw:
            self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
        if 'accept_missing_endtag_name' in kw:
            self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
        if 'map_case' in kw:
            self.__map_case = kw['map_case']
        if 'accept_utf8' in kw:
            self.__accept_utf8 = kw['accept_utf8']
        if 'translate_attribute_references' in kw:
            self.__translate_attribute_references = kw['translate_attribute_references']
        self.reset()

    def __fixelements(self):
        """Build a per-instance ``elements`` table from start_*/end_* names."""
        self.__fixed = 1
        self.elements = {}
        # Instance attributes are scanned first, then the class hierarchy;
        # __fixdict never overwrites a handler that is already recorded.
        self.__fixdict(self.__dict__)
        self.__fixclass(self.__class__)

    def __fixclass(self, kl):
        """Scan class *kl* and, recursively, all of its base classes."""
        self.__fixdict(kl.__dict__)
        for base in kl.__bases__:
            self.__fixclass(base)

    def __fixdict(self, namespace):
        """Record every ``start_<tag>`` / ``end_<tag>`` name in *namespace*.

        self.elements maps tag -> (start handler, end handler); a slot that
        is already filled is left untouched.
        """
        # Iterate over a snapshot of the keys so the loop does not depend on
        # a live view of the mapping.
        for key in list(namespace):
            if key.startswith('start_'):
                tag = key[6:]
                start, end = self.elements.get(tag, (None, None))
                if start is None:
                    self.elements[tag] = getattr(self, key), end
            elif key.startswith('end_'):
                tag = key[4:]
                start, end = self.elements.get(tag, (None, None))
                if end is None:
                    self.elements[tag] = start, getattr(self, key)

    # Interface -- reset this instance. Loses all unprocessed data
    def reset(self):
        """Return the parser to its initial state, dropping buffered data."""
        self.rawdata = ''
        self.stack = []
        self.nomoretags = 0
        self.literal = 0
        self.lineno = 1
        self.__at_start = 1
        self.__seen_doctype = None
        self.__seen_starttag = 0
        self.__use_namespaces = 0
        self.__namespaces = {'xml': None}   # xml is implicitly declared
        # backward compatibility hack: if elements not overridden,
        # fill it in ourselves
        if self.elements is XMLParser.elements:
            self.__fixelements()

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        """Set both the nomoretags and the literal flags."""
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        """Set the literal flag; any positional arguments are ignored."""
        self.literal = 1

    # Interface -- feed some data to the parser. Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n'). (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        """Append *data* to the pending buffer and call goahead(0)."""
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        """Call goahead(1), then drop an auto-built ``elements`` table."""
        self.goahead(1)
        if self.__fixed:
            self.__fixed = 0
            # remove self.elements so that we don't leak
            del self.elements

    # Interface -- translate references
    def translate_references(self, data, all = 1):
        """Return *data* with character and entity references replaced.

        Returns *data* unchanged when the translate_attribute_references
        option is false.  Otherwise every '&' is examined: numeric
        references (&#NN; / &#xHH;) are replaced by chr() of their value,
        and, when *all* is true, names found in self.entitydefs are replaced
        by their definition, which is then rescanned for further references.
        Malformed or unknown references are reported via self.syntax_error().
        """
        if not self.__translate_attribute_references:
            return data
        i = 0
        while 1:
            res = amp.search(data, i)
            if res is None:
                return data
            s = res.start(0)
            res = ref.match(data, s)
            if res is None:
                self.syntax_error("bogus `&'")
                i = s + 1
                continue
            # i is just past the character that follows the reference
            # (normally the ';').
            i = res.end(0)
            text = res.group(1)
            rescan = 0
            if text[0] == '#':
                try:
                    if text[1] == 'x':
                        text = chr(int(text[2:], 16))
                    else:
                        text = chr(int(text[1:]))
                except (ValueError, OverflowError):
                    # chr() rejects values outside its supported range;
                    # report it like any other malformed reference and
                    # leave the original text in place.
                    self.syntax_error("illegal character reference `&%s;'" % text)
                    i = s + 1           # just past the &
                    continue
                if data[i - 1] != ';':
                    self.syntax_error("`;' missing after char reference")
                    i = i - 1
            elif all:
                if text in self.entitydefs:
                    # NOTE(review): when the reference is not terminated by
                    # ';', the terminating character matched by `ref` is
                    # replaced together with the reference -- confirm intended.
                    text = self.entitydefs[text]
                    rescan = 1
                elif data[i - 1] != ';':
                    self.syntax_error("bogus `&'")
                    i = s + 1           # just past the &
                    continue
                else:
                    self.syntax_error("reference to unknown entity `&%s;'" % text)
                    text = '&' + text + ';'
            elif data[i - 1] != ';':
                self.syntax_error("bogus `&'")
                i = s + 1               # just past the &
                continue

            # when we get here, text contains the translated text and i points
            # to the end of the string that is to be replaced
            data = data[:s] + text + data[i:]
            if rescan:
                i = s
            else:
                i = s + len(text)

    # Interface - return a dictionary of all namespaces currently valid
    def getnamespace(self):
        """Merge the namespace dicts of all stack entries, outermost first."""
        nsdict = {}
        for t, d, nst in self.stack:
            nsdict.update(d)
        return nsdict
2383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 2393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- handle data as far as reasonable. May leave state 2403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # and data to be processed by a subsequent call. If 'end' is 2413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # true, force handling all data as if followed by EOF marker. 2423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def goahead(self, end): 2433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel rawdata = self.rawdata 2443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = 0 2453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel n = len(rawdata) 2463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel while i < n: 2473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if i > 0: 2483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__at_start = 0 2493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.nomoretags: 2503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = rawdata[i:n] 2513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(data) 2523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + data.count('\n') 2533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = n 2543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel break 2553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = interesting.search(rawdata, i) 2563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res: 2573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel j = res.start(0) 2583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 2593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel j = n 2603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if i < j: 2613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = rawdata[i:j] 2623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__at_start and 
space.match(data) is None: 2633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('illegal data at start of file') 2643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__at_start = 0 2653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.stack and space.match(data) is None: 2663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('data not in content') 2673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__accept_utf8 and illegal.search(data): 2683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('illegal character in content') 2693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(data) 2703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + data.count('\n') 2713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = j 2723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if i == n: break 2733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if rawdata[i] == '<': 2743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if starttagopen.match(rawdata, i): 2753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.literal: 2763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = rawdata[i] 2773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(data) 2783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + data.count('\n') 2793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = i+1 2803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 2813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = self.parse_starttag(i) 2823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if k < 0: break 2833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__seen_starttag = 1 2843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + rawdata[i:k].count('\n') 
2853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = k 2863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 2873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if endtagopen.match(rawdata, i): 2883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = self.parse_endtag(i) 2893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if k < 0: break 2903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + rawdata[i:k].count('\n') 2913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = k 2923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 2933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if commentopen.match(rawdata, i): 2943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.literal: 2953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = rawdata[i] 2963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(data) 2973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + data.count('\n') 2983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = i+1 2993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = self.parse_comment(i) 3013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if k < 0: break 3023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + rawdata[i:k].count('\n') 3033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = k 3043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if cdataopen.match(rawdata, i): 3063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = self.parse_cdata(i) 3073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if k < 0: break 3083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + rawdata[i:k].count('\n') 3093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = 
k 3103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = xmldecl.match(rawdata, i) 3123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res: 3133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__at_start: 3143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("<?xml?> declaration not at start of document") 3153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel version, encoding, standalone = res.group('version', 3163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 'encoding', 3173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 'standalone') 3183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if version[1:-1] != '1.0': 3193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel raise Error('only XML version 1.0 supported') 3203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if encoding: encoding = encoding[1:-1] 3213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if standalone: standalone = standalone[1:-1] 3223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_xml(encoding, standalone) 3233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = res.end(0) 3243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = procopen.match(rawdata, i) 3263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res: 3273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = self.parse_proc(i) 3283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if k < 0: break 3293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + rawdata[i:k].count('\n') 3303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = k 3313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = doctype.match(rawdata, i) 
3333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res: 3343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.literal: 3353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = rawdata[i] 3363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(data) 3373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + data.count('\n') 3383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = i+1 3393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__seen_doctype: 3413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('multiple DOCTYPE elements') 3423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__seen_starttag: 3433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('DOCTYPE not at beginning of document') 3443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = self.parse_doctype(res) 3453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if k < 0: break 3463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__seen_doctype = res.group('name') 3473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__map_case: 3483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__seen_doctype = self.__seen_doctype.lower() 3493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + rawdata[i:k].count('\n') 3503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = k 3513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif rawdata[i] == '&': 3533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.literal: 3543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = rawdata[i] 3553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(data) 3563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = i+1 
3573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = charref.match(rawdata, i) 3593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is not None: 3603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = res.end(0) 3613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if rawdata[i-1] != ';': 3623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("`;' missing in charref") 3633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = i-1 3643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.stack: 3653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('data not in content') 3663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_charref(res.group('char')[:-1]) 3673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + res.group(0).count('\n') 3683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = entityref.match(rawdata, i) 3703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is not None: 3713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = res.end(0) 3723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if rawdata[i-1] != ';': 3733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("`;' missing in entityref") 3743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = i-1 3753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel name = res.group('name') 3763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__map_case: 3773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel name = name.lower() 3783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if name in self.entitydefs: 3793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:] 
3803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel n = len(rawdata) 3813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = res.start(0) 3823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 3833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.unknown_entityref(name) 3843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + res.group(0).count('\n') 3853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif rawdata[i] == ']': 3873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.literal: 3883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = rawdata[i] 3893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(data) 3903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = i+1 3913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if n-i < 3: 3933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel break 3943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if cdataclose.match(rawdata, i): 3953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("bogus `]]>'") 3963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(rawdata[i]) 3973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = i+1 3983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 3993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 4003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel raise Error('neither < nor & ??') 4013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # We get here only if incomplete matches but 4023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # nothing else 4033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel break 4043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # end while 4053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if i > 0: 
4063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__at_start = 0 4073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if end and i < n: 4083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = rawdata[i] 4093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("bogus `%s'" % data) 4103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__accept_utf8 and illegal.search(data): 4113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('illegal character in content') 4123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(data) 4133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.lineno = self.lineno + data.count('\n') 4143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.rawdata = rawdata[i+1:] 4153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return self.goahead(end) 4163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.rawdata = rawdata[i:] 4173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if end: 4183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__seen_starttag: 4193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('no elements in file') 4203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.stack: 4213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('missing end tags') 4223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel while self.stack: 4233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.finish_endtag(self.stack[-1][0]) 4243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 4253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- parse comment, return length or -1 if not terminated 4263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def parse_comment(self, i): 4273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel rawdata = self.rawdata 4283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if 
rawdata[i:i+4] != '<!--': 4293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel raise Error('unexpected call to handle_comment') 4303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = commentclose.search(rawdata, i+4) 4313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is None: 4323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return -1 4333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if doubledash.search(rawdata, i+4, res.start(0)): 4343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("`--' inside comment") 4353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if rawdata[res.start(0)-1] == '-': 4363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('comment cannot end in three dashes') 4373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__accept_utf8 and \ 4383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel illegal.search(rawdata, i+4, res.start(0)): 4393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('illegal character in comment') 4403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_comment(rawdata[i+4: res.start(0)]) 4413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return res.end(0) 4423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 4433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- handle DOCTYPE tag, return length or -1 if not terminated 4443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def parse_doctype(self, res): 4453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel rawdata = self.rawdata 4463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel n = len(rawdata) 4473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel name = res.group('name') 4483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__map_case: 4493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel name = name.lower() 4503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 
pubid, syslit = res.group('pubid', 'syslit') 4513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if pubid is not None: 4523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel pubid = pubid[1:-1] # remove quotes 4533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel pubid = ' '.join(pubid.split()) # normalize 4543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if syslit is not None: syslit = syslit[1:-1] # remove quotes 4553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel j = k = res.end(0) 4563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if k >= n: 4573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return -1 4583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if rawdata[k] == '[': 4593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel level = 0 4603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = k+1 4613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel dq = sq = 0 4623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel while k < n: 4633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel c = rawdata[k] 4643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not sq and c == '"': 4653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel dq = not dq 4663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif not dq and c == "'": 4673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel sq = not sq 4683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif sq or dq: 4693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel pass 4703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif level <= 0 and c == ']': 4713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = endbracket.match(rawdata, k+1) 4723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is None: 4733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return -1 4743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_doctype(name, pubid, syslit, rawdata[j+1:k]) 
4753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return res.end(0) 4763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif c == '<': 4773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel level = level + 1 4783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif c == '>': 4793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel level = level - 1 4803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if level < 0: 4813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("bogus `>' in DOCTYPE") 4823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = k+1 4833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = endbracketfind.match(rawdata, k) 4843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is None: 4853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return -1 4863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if endbracket.match(rawdata, k) is None: 4873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('garbage in DOCTYPE') 4883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_doctype(name, pubid, syslit, None) 4893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return res.end(0) 4903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 4913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- handle CDATA tag, return length or -1 if not terminated 4923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def parse_cdata(self, i): 4933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel rawdata = self.rawdata 4943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if rawdata[i:i+9] != '<![CDATA[': 4953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel raise Error('unexpected call to parse_cdata') 4963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = cdataclose.search(rawdata, i+9) 4973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is None: 4983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl 
McDaniel return -1 4993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__accept_utf8 and \ 5003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel illegal.search(rawdata, i+9, res.start(0)): 5013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('illegal character in CDATA') 5023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.stack: 5033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('CDATA not in content') 5043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_cdata(rawdata[i+9:res.start(0)]) 5053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return res.end(0) 5063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 5073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None} 5083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- handle a processing instruction tag 5093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def parse_proc(self, i): 5103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel rawdata = self.rawdata 5113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel end = procclose.search(rawdata, i) 5123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if end is None: 5133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return -1 5143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel j = end.start(0) 5153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__accept_utf8 and illegal.search(rawdata, i+2, j): 5163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('illegal character in processing instruction') 5173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = tagfind.match(rawdata, i+2) 5183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is None: 5193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel raise Error('unexpected call to parse_proc') 
5203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = res.end(0) 5213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel name = res.group(0) 5223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__map_case: 5233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel name = name.lower() 5243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if name == 'xml:namespace': 5253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('old-fashioned namespace declaration') 5263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__use_namespaces = -1 5273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # namespace declaration 5283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # this must come after the <?xml?> declaration (if any) 5293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # and before the <!DOCTYPE> (if any). 5303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__seen_doctype or self.__seen_starttag: 5313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('xml:namespace declaration too late in document') 5323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrdict, namespace, k = self.parse_attributes(name, k, j) 5333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if namespace: 5343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('namespace declaration inside namespace declaration') 5353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for attrname in attrdict.keys(): 5363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not attrname in self.__xml_namespace_attributes: 5373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname) 5383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not 'ns' in attrdict or not 'prefix' in attrdict: 5393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('xml:namespace without required 
attributes') 5403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel prefix = attrdict.get('prefix') 5413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if ncname.match(prefix) is None: 5423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('xml:namespace illegal prefix value') 5433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return end.end(0) 5443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if prefix in self.__namespaces: 5453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('xml:namespace prefix not unique') 5463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__namespaces[prefix] = attrdict['ns'] 5473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 5483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if name.lower() == 'xml': 5493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('illegal processing instruction target name') 5503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_proc(name, rawdata[k:j]) 5513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return end.end(0) 5523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 5533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- parse attributes between i and j 5543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def parse_attributes(self, tag, i, j): 5553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel rawdata = self.rawdata 5563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrdict = {} 5573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel namespace = {} 5583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel while i < j: 5593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = attrfind.match(rawdata, i) 5603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is None: 5613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel break 5623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrname, attrvalue = 
res.group('name', 'value') 5633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__map_case: 5643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrname = attrname.lower() 5653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel i = res.end(0) 5663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if attrvalue is None: 5673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("no value specified for attribute `%s'" % attrname) 5683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrvalue = attrname 5693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif attrvalue[:1] == "'" == attrvalue[-1:] or \ 5703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrvalue[:1] == '"' == attrvalue[-1:]: 5713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrvalue = attrvalue[1:-1] 5723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif not self.__accept_unquoted_attributes: 5733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("attribute `%s' value not quoted" % attrname) 5743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = xmlns.match(attrname) 5753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is not None: 5763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # namespace declaration 5773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel ncname = res.group('ncname') 5783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel namespace[ncname or ''] = attrvalue or None 5793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__use_namespaces: 5803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__use_namespaces = len(self.stack)+1 5813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel continue 5823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if '<' in attrvalue: 5833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("`<' illegal in attribute value") 5843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl 
McDaniel if attrname in attrdict: 5853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("attribute `%s' specified twice" % attrname) 5863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrvalue = attrvalue.translate(attrtrans) 5873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrdict[attrname] = self.translate_references(attrvalue) 5883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return attrdict, namespace, i 5893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 5903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- handle starttag, return length or -1 if not terminated 5913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def parse_starttag(self, i): 5923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel rawdata = self.rawdata 5933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # i points to start of tag 5943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel end = endbracketfind.match(rawdata, i+1) 5953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if end is None: 5963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return -1 5973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel tag = starttagmatch.match(rawdata, i) 5983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if tag is None or tag.end(0) != end.end(0): 5993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('garbage in starttag') 6003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return end.end(0) 6013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel nstag = tagname = tag.group('tagname') 6023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__map_case: 6033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel nstag = tagname = nstag.lower() 6043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__seen_starttag and self.__seen_doctype and \ 6053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel tagname != self.__seen_doctype: 
6063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('starttag does not match DOCTYPE') 6073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__seen_starttag and not self.stack: 6083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('multiple elements on top level') 6093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k, j = tag.span('attrs') 6103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrdict, nsdict, k = self.parse_attributes(tagname, k, j) 6113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.stack.append((tagname, nsdict, nstag)) 6123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__use_namespaces: 6133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = qname.match(tagname) 6143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 6153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = None 6163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is not None: 6173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel prefix, nstag = res.group('prefix', 'local') 6183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if prefix is None: 6193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel prefix = '' 6203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel ns = None 6213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for t, d, nst in self.stack: 6223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if prefix in d: 6233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel ns = d[prefix] 6243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if ns is None and prefix != '': 6253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel ns = self.__namespaces.get(prefix) 6263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if ns is not None: 6273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel nstag = ns + ' ' + nstag 6283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif prefix != '': 
6293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel nstag = prefix + ':' + nstag # undo split 6303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.stack[-1] = tagname, nsdict, nstag 6313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # translate namespace of attributes 6323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrnamemap = {} # map from new name to old name (used for error reporting) 6333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for key in attrdict.keys(): 6343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrnamemap[key] = key 6353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__use_namespaces: 6363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel nattrdict = {} 6373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for key, val in attrdict.items(): 6383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel okey = key 6393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = qname.match(key) 6403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is not None: 6413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel aprefix, key = res.group('prefix', 'local') 6423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__map_case: 6433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel key = key.lower() 6443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if aprefix is not None: 6453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel ans = None 6463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for t, d, nst in self.stack: 6473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if aprefix in d: 6483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel ans = d[aprefix] 6493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if ans is None: 6503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel ans = self.__namespaces.get(aprefix) 6513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if ans is not None: 
6523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel key = ans + ' ' + key 6533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 6543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel key = aprefix + ':' + key 6553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel nattrdict[key] = val 6563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrnamemap[key] = okey 6573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrdict = nattrdict 6583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attributes = self.attributes.get(nstag) 6593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if attributes is not None: 6603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for key in attrdict.keys(): 6613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not key in attributes: 6623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname)) 6633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for key, val in attributes.items(): 6643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if val is not None and not key in attrdict: 6653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel attrdict[key] = val 6663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel method = self.elements.get(nstag, (None, None))[0] 6673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.finish_starttag(nstag, attrdict, method) 6683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if tag.group('slash') == '/': 6693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.finish_endtag(tagname) 6703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return tag.end(0) 6713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 6723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- parse endtag 6733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def parse_endtag(self, i): 6743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel rawdata = 
self.rawdata 6753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel end = endbracketfind.match(rawdata, i+1) 6763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if end is None: 6773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return -1 6783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel res = tagfind.match(rawdata, i+2) 6793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if res is None: 6803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.literal: 6813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(rawdata[i]) 6823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return i+1 6833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.__accept_missing_endtag_name: 6843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('no name specified in end tag') 6853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel tag = self.stack[-1][0] 6863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = i+2 6873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 6883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel tag = res.group(0) 6893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__map_case: 6903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel tag = tag.lower() 6913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.literal: 6923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not self.stack or tag != self.stack[-1][0]: 6933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_data(rawdata[i]) 6943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return i+1 6953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel k = res.end(0) 6963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if endbracket.match(rawdata, k) is None: 6973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('garbage in end tag') 6983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.finish_endtag(tag) 
6993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return end.end(0) 7003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 7013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- finish processing of start tag 7023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def finish_starttag(self, tagname, attrdict, method): 7033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if method is not None: 7043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_starttag(tagname, method, attrdict) 7053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 7063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.unknown_starttag(tagname, attrdict) 7073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 7083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Internal -- finish processing of end tag 7093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def finish_endtag(self, tag): 7103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.literal = 0 7113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not tag: 7123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('name-less end tag') 7133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel found = len(self.stack) - 1 7143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if found < 0: 7153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.unknown_endtag(tag) 7163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return 7173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 7183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel found = -1 7193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for i in range(len(self.stack)): 7203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if tag == self.stack[i][0]: 7213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel found = i 7223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if found == -1: 7233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl 
McDaniel self.syntax_error('unopened end tag') 7243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return 7253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel while len(self.stack) > found: 7263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if found < len(self.stack) - 1: 7273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error('missing close tag for %s' % self.stack[-1][2]) 7283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel nstag = self.stack[-1][2] 7293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel method = self.elements.get(nstag, (None, None))[1] 7303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if method is not None: 7313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.handle_endtag(nstag, method) 7323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 7333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.unknown_endtag(nstag) 7343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if self.__use_namespaces == len(self.stack): 7353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.__use_namespaces = 0 7363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel del self.stack[-1] 7373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 7383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Overridable -- handle xml processing instruction 7393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def handle_xml(self, encoding, standalone): 7403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel pass 7413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 7423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Overridable -- handle DOCTYPE 7433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def handle_doctype(self, tag, pubid, syslit, data): 7443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel pass 7453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 7463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # Overridable -- handle start tag 
# NOTE(review): SOURCE's layout is collapsed (blame prefixes, no indentation).
# The definitions below all take `self` and are presumably members of the
# XMLParser class -- re-indent them to class-body level when the file layout
# is restored.
def handle_starttag(self, tag, method, attrs):
    """Invoke the handler registered for a start tag.

    finish_starttag() calls this only when a handler was found for the
    tag.  `tag` is the (possibly namespace-qualified) tag name, `method`
    is the handler, and `attrs` is the attribute dictionary.  The default
    implementation ignores `tag` and calls ``method(attrs)``.
    """
    method(attrs)

# Overridable -- handle end tag
def handle_endtag(self, tag, method):
    """Invoke the handler registered for an end tag.

    finish_endtag() calls this only when a handler was found for the
    tag.  The default implementation ignores `tag` and calls ``method()``.
    """
    method()

# Example -- handle character reference, no need to override
def handle_charref(self, name):
    """Convert a character reference to data.

    `name` is the text between ``&#`` and the terminator: decimal digits,
    or ``x`` followed by hexadecimal digits.  A value in the range 0..255
    is passed to handle_data() as a single character; anything that does
    not parse, or that falls outside that range, is passed to
    unknown_charref() unchanged.
    """
    try:
        # name[:1] rather than name[0]: an empty name then fails in int()
        # and is reported through unknown_charref() instead of raising
        # IndexError.
        if name[:1] == 'x':
            n = int(name[1:], 16)
        else:
            n = int(name)
    except ValueError:
        self.unknown_charref(name)
        return
    if not 0 <= n <= 255:
        self.unknown_charref(name)
        return
    self.handle_data(chr(n))

# Definition of entities -- derived classes may override
#
# The values are character references, not the literal characters.  The
# replacement text of a known entity is spliced back into the input
# buffer and scanned again, so a literal '<' or '&' here would be read as
# markup.  The remaining entries use the same form for uniformity.
entitydefs = {'lt': '&#60;',        # must use charref
              'gt': '&#62;',
              'amp': '&#38;',       # must use charref
              'quot': '&#34;',
              'apos': '&#39;',
              }

# Example -- handle data, should be overridden
def handle_data(self, data):
    """Receive a run of character data; the default does nothing."""
    pass

# Example -- handle cdata, could be overridden
def handle_cdata(self, data):
    """Receive the contents of a CDATA section; the default does nothing."""
    pass

# Example -- handle comment, could be overridden
def handle_comment(self, data):
    """Receive the text of a comment; the default does nothing."""
    pass

# Example -- handle processing instructions, could be overridden
def handle_proc(self, name, data):
    """Receive a processing instruction.

    `name` is the instruction target and `data` is the text that follows
    it.  The default does nothing.
    """
    pass

# Example -- handle relatively harmless syntax errors, could be overridden
def syntax_error(self, message):
    """Report a syntax error.

    The default raises Error with the current line number and `message`.
    Several callers continue parsing after calling this method, so an
    override may return normally instead of raising.
    """
    raise Error('Syntax error at line %d: %s' % (self.lineno, message))
7963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 7973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # To be overridden -- handlers for unknown objects 7983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def unknown_starttag(self, tag, attrs): pass 7993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def unknown_endtag(self, tag): pass 8003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def unknown_charref(self, ref): pass 8013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def unknown_entityref(self, name): 8023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.syntax_error("reference to unknown entity `&%s;'" % name) 8033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielclass TestXMLParser(XMLParser): 8063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def __init__(self, **kw): 8083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.testdata = "" 8093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel XMLParser.__init__(self, **kw) 8103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def handle_xml(self, encoding, standalone): 8123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'xml: encoding =',encoding,'standalone =',standalone 8143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def handle_doctype(self, tag, pubid, syslit, data): 8163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'DOCTYPE:',tag, repr(data) 8183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def 
handle_data(self, data): 8203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.testdata = self.testdata + data 8213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if len(repr(self.testdata)) >= 70: 8223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def flush(self): 8253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = self.testdata 8263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if data: 8273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.testdata = "" 8283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'data:', repr(data) 8293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def handle_cdata(self, data): 8313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'cdata:', repr(data) 8333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def handle_proc(self, name, data): 8353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'processing:',name,repr(data) 8373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def handle_comment(self, data): 8393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel r = repr(data) 8413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if len(r) > 68: 8423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel r = r[:32] + '...' 
+ r[-32:] 8433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'comment:', r 8443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def syntax_error(self, message): 8463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'error at line %d:' % self.lineno, message 8473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def unknown_starttag(self, tag, attrs): 8493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not attrs: 8513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'start tag: <' + tag + '>' 8523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 8533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'start tag: <' + tag, 8543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for name, value in attrs.items(): 8553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print name + '=' + '"' + value + '"', 8563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print '>' 8573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def unknown_endtag(self, tag): 8593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'end tag: </' + tag + '>' 8613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def unknown_entityref(self, ref): 8633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print '*** unknown entity ref: &' + ref + ';' 8653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def unknown_charref(self, ref): 8673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 
8683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print '*** unknown char ref: &#' + ref + ';' 8693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel def close(self): 8713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel XMLParser.close(self) 8723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel self.flush() 8733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanieldef test(args = None): 8753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel import sys, getopt 8763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel from time import time 8773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if not args: 8793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel args = sys.argv[1:] 8803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel opts, args = getopt.getopt(args, 'st') 8823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel klass = TestXMLParser 8833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel do_time = 0 8843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for o, a in opts: 8853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if o == '-s': 8863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel klass = XMLParser 8873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel elif o == '-t': 8883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel do_time = 1 8893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 8903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if args: 8913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel file = args[0] 8923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 8933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel file = 'test.xml' 8943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 
8953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if file == '-': 8963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel f = sys.stdin 8973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 8983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel try: 8993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel f = open(file, 'r') 9003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel except IOError, msg: 9013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print file, ":", msg 9023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel sys.exit(1) 9033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 9043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel data = f.read() 9053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if f is not sys.stdin: 9063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel f.close() 9073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 9083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel x = klass() 9093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel t0 = time() 9103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel try: 9113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if do_time: 9123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel x.feed(data) 9133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel x.close() 9143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 9153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for c in data: 9163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel x.feed(c) 9173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel x.close() 9183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel except Error, msg: 9193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel t1 = time() 9203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print msg 9213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if do_time: 9223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'total time: %g' % (t1-t0) 
9233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel sys.exit(1) 9243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel t1 = time() 9253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if do_time: 9263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel print 'total time: %g' % (t1-t0) 9273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 9283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 9293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielif __name__ == '__main__': 9303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel test() 931