13257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel"""A parser for XML, using the derived class as static DTD."""
23257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
33257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel# Author: Sjoerd Mullender.
43257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
53257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielimport re
63257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielimport string
73257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
# xmllib is obsolete: say so once, when the module is imported.
# stacklevel=2 attributes the warning to the importing code instead of to
# this module.  The temporary ``warnings`` binding is removed again so it
# does not stay in this module's namespace.
import warnings
warnings.warn(
    "The xmllib module is obsolete.  Use xml.sax instead.",
    category=DeprecationWarning,
    stacklevel=2)
del warnings

# Version string exposed as a module attribute.
version = '0.3'
143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
class Error(RuntimeError):
    """Exception raised by this module.

    It is a plain RuntimeError subclass that adds no behaviour; the parser
    raises it, for example, when the XML declaration names a version other
    than 1.0.
    """
173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
# Regular expressions used for parsing
#
# Pattern fragments that contain a regex escape such as \? or \( are
# written either as raw strings or with a doubled backslash.  The text
# handed to re.compile() is the same as with a bare backslash, but the
# source no longer relies on invalid string escape sequences, which newer
# Python versions warn about.

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')       # characters that end a run of data

amp = re.compile('&')
# Each of the three reference patterns also consumes the one character
# that follows the reference (normally the ';').
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')            # a string of white space only
newline = re.compile('\n')

# One attribute: mandatory leading white space, a name and, optionally,
# "= value" where the value is a quoted string or a bare token.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>' + _QStr + r'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# A complete start tag: name, any number of attributes, optional "/".
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                      starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# Everything up to the next '>' that is not inside a quoted string.
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# Both literal templates take the name of their group through %-formatting,
# which is why a literal percent sign is spelled %% below.
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# The version, encoding and standalone groups include the quotes.
xmldecl = re.compile(r'<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# Translation table that turns every white space character into a plain
# space.  string.maketrans() only exists on Python 2; where str.maketrans()
# is available it is used instead.  Either table works with the
# translate() method of the matching string type.
_maketrans = getattr(str, 'maketrans', None) or string.maketrans
attrtrans = _maketrans(' \r\n\t', '    ')
del _maketrans
743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
# a complete string that is exactly one NCName
ncname = re.compile('%s$' % _NCName)
# an optional prefix followed by ":", then the mandatory local part
qname = re.compile('(?:(?P<prefix>%(nc)s):)?(?P<local>%(nc)s)$'
                   % {'nc': _NCName})

# the attribute name "xmlns", optionally followed by ":" and an NCName
xmlns = re.compile('xmlns(?::(?P<ncname>%s))?$' % _NCName)
823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel# XML parser base class -- find tags and call handler functions.
843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel# Usage: p = XMLParser(); p.feed(data); ...; p.close().
853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel# The dtd is defined by deriving a class which defines methods with
863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel# special names to handle tags: start_foo and end_foo to handle <foo>
873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel# and </foo>, respectively.  The data between tags is passed to the
883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel# parser by calling self.handle_data() with some data as argument (the
893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel# data may be split up in arbitrary chunks).
903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielclass XMLParser:
923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    attributes = {}                     # default, to be overridden
933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    elements = {}                       # default, to be overridden
943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # parsing options, settable using keyword args in __init__
963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    __accept_unquoted_attributes = 0
973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    __accept_missing_endtag_name = 0
983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    __map_case = 0
993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    __accept_utf8 = 0
1003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    __translate_attribute_references = 1
1013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Interface -- initialize and reset this instance
def __init__(self, **kw):
    """Create a parser, optionally overriding the parsing options.

    The recognised keywords are accept_unquoted_attributes,
    accept_missing_endtag_name, map_case, accept_utf8 and
    translate_attribute_references.  Only options that are actually
    passed are stored on the instance; any other keyword is ignored.
    Finally reset() is called to put the parser in its initial state.
    """
    self.__fixed = 0
    for option, value in kw.items():
        if option == 'accept_unquoted_attributes':
            self.__accept_unquoted_attributes = value
        elif option == 'accept_missing_endtag_name':
            self.__accept_missing_endtag_name = value
        elif option == 'map_case':
            self.__map_case = value
        elif option == 'accept_utf8':
            self.__accept_utf8 = value
        elif option == 'translate_attribute_references':
            self.__translate_attribute_references = value
    self.reset()
1163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
def __fixelements(self):
    """Build self.elements from the start_*/end_* handlers of this parser.

    A fresh per-instance dictionary is installed, the instance is flagged
    as carrying such a generated table, and the table is then filled from
    the instance dictionary first and the class hierarchy second.
    """
    self.elements = {}
    self.__fixed = 1
    self.__fixdict(vars(self))
    self.__fixclass(self.__class__)
1223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
def __fixclass(self, kl):
    """Register the handlers found in class kl and in all of its bases.

    Classes are visited depth first, each class before its bases and the
    bases from left to right.
    """
    pending = [kl]
    while pending:
        klass = pending.pop()
        self.__fixdict(klass.__dict__)
        # pushed in reverse so that the leftmost base is popped first
        pending.extend(reversed(klass.__bases__))
1273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
def __fixdict(self, dict):
    """Record the start_*/end_* names found among the keys of dict.

    For a key start_TAG or end_TAG the attribute of that name is looked
    up on self and stored in the matching half of self.elements[TAG],
    which is a (start handler, end handler) pair.  A half that is already
    filled in is left alone.
    """
    for key in dict.keys():
        if key.startswith('start_'):
            slot, tag = 0, key[6:]
        elif key.startswith('end_'):
            slot, tag = 1, key[4:]
        else:
            continue
        handlers = list(self.elements.get(tag, (None, None)))
        if handlers[slot] is None:
            handlers[slot] = getattr(self, key)
            self.elements[tag] = tuple(handlers)
1403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Interface -- reset this instance.  Loses all unprocessed data
def reset(self):
    """Put the parser back in its initial state.

    Any data that was fed but not yet processed is lost.
    """
    # input buffer and line counter
    self.rawdata = ''
    self.lineno = 1
    # stack of open elements and the literal-mode switches
    self.stack = []
    self.literal = 0
    self.nomoretags = 0
    # what has been seen of the document so far
    self.__at_start = 1
    self.__seen_starttag = 0
    self.__seen_doctype = None
    # namespace bookkeeping
    self.__use_namespaces = 0
    self.__namespaces = {'xml':None}   # xml is implicitly declared
    # backward compatibility hack: if elements not overridden,
    # fill it in ourselves
    if self.elements is XMLParser.elements:
        self.__fixelements()
1573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # For derived classes only -- enter literal mode (CDATA) till EOF
def setnomoretags(self):
    """Switch on both the nomoretags and the literal flag.

    While nomoretags is set, goahead() hands all remaining input to
    handle_data() without looking for markup.
    """
    self.nomoretags = 1
    self.literal = 1
1613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # For derived classes only -- enter literal mode (CDATA)
def setliteral(self, *args):
    """Switch on the literal flag.

    Any positional arguments are accepted and ignored.
    """
    self.literal = 1
1653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Interface -- feed some data to the parser.  Call this as
1673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # often as you want, with as little or as much text as you
1683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # want (may include '\n').  (This just saves the text, all the
1693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # processing is done by goahead().)
def feed(self, data):
    """Append data to the buffered input and process what can be processed.

    The text is only stored here; the parsing itself is done by
    goahead(), which is called with a false 'end' argument.
    """
    self.rawdata += data
    self.goahead(0)
1733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Interface -- handle the remaining data
def close(self):
    """Process all remaining input as if it were followed by end of file.

    If the elements table was generated for this instance, it is removed
    again afterwards.
    """
    self.goahead(1)
    if not self.__fixed:
        return
    self.__fixed = 0
    # remove self.elements so that we don't leak
    del self.elements
1813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Interface -- translate references
def translate_references(self, data, all = 1):
    """Return data with the references it contains replaced.

    data -- the text to translate.
    all  -- when true (the default), a name found in self.entitydefs is
            replaced by its definition, and the replacement text is
            scanned again for further references; an unknown `&name;'
            is reported and left in place.  When false, entity
            definitions are not consulted.

    `&#NNN;' and `&#xHHH;' are always replaced by chr() of the number.
    Problems are reported by calling self.syntax_error().  When the
    translate_attribute_references option is off, data is returned
    unchanged.

    A reference that is not closed by `;' is reported, and the character
    that ended it is kept in the result.  This also holds for a known
    entity; that character used to be dropped silently in that case.

    NOTE(review): with all false, a well-formed `&name;' ends up replaced
    by the bare name.  That behaviour is kept as it was -- confirm that
    it is intended.
    """
    if not self.__translate_attribute_references:
        return data
    i = 0
    while 1:
        res = amp.search(data, i)
        if res is None:
            return data
        s = res.start(0)
        res = ref.match(data, s)
        if res is None:
            self.syntax_error("bogus `&'")
            i = s+1
            continue
        # ref also consumes the single character that follows the
        # reference, so i points just past that character.
        i = res.end(0)
        text = res.group(1)
        terminated = data[i - 1] == ';'
        rescan = 0
        if text[0] == '#':
            if text[1] == 'x':
                text = chr(int(text[2:], 16))
            else:
                text = chr(int(text[1:]))
            if not terminated:
                self.syntax_error("`;' missing after char reference")
                i = i-1
        elif all:
            if text in self.entitydefs:
                text = self.entitydefs[text]
                rescan = 1
                if not terminated:
                    # the character after the name is ordinary text:
                    # report the missing `;' and leave that character
                    # in place instead of replacing it as well
                    self.syntax_error("`;' missing after entity reference")
                    i = i-1
            elif not terminated:
                self.syntax_error("bogus `&'")
                i = s + 1 # just past the &
                continue
            else:
                self.syntax_error("reference to unknown entity `&%s;'" % text)
                text = '&' + text + ';'
        elif not terminated:
            self.syntax_error("bogus `&'")
            i = s + 1 # just past the &
            continue

        # when we get here, text contains the translated text and i points
        # to the end of the string that is to be replaced
        data = data[:s] + text + data[i:]
        if rescan:
            # the replacement may itself contain references
            i = s
        else:
            i = s + len(text)
2313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Interface - return a dictionary of all namespaces currently valid
def getnamespace(self):
    """Return one dictionary merged from the second item of every stack entry.

    Entries are applied from the bottom of self.stack to the top, so a
    key set by a later entry replaces the same key from an earlier one.
    A new dictionary is built on every call.
    """
    merged = {}
    for entry in self.stack:
        # every stack entry is a 3-tuple; only its second item is used
        _first, declared, _third = entry
        merged.update(declared)
    return merged
2383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Internal -- handle data as far as reasonable.  May leave state
2403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # and data to be processed by a subsequent call.  If 'end' is
2413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # true, force handling all data as if followed by EOF marker.
2423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def goahead(self, end):
2433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        rawdata = self.rawdata
2443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        i = 0
2453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        n = len(rawdata)
2463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        while i < n:
2473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if i > 0:
2483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.__at_start = 0
2493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if self.nomoretags:
2503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                data = rawdata[i:n]
2513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.handle_data(data)
2523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.lineno = self.lineno + data.count('\n')
2533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                i = n
2543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                break
2553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            res = interesting.search(rawdata, i)
2563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if res:
2573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                j = res.start(0)
2583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            else:
2593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                j = n
2603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if i < j:
2613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                data = rawdata[i:j]
2623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if self.__at_start and space.match(data) is None:
2633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.syntax_error('illegal data at start of file')
2643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.__at_start = 0
2653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if not self.stack and space.match(data) is None:
2663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.syntax_error('data not in content')
2673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if not self.__accept_utf8 and illegal.search(data):
2683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.syntax_error('illegal character in content')
2693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.handle_data(data)
2703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.lineno = self.lineno + data.count('\n')
2713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            i = j
2723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if i == n: break
2733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if rawdata[i] == '<':
2743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if starttagopen.match(rawdata, i):
2753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if self.literal:
2763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        data = rawdata[i]
2773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.handle_data(data)
2783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.lineno = self.lineno + data.count('\n')
2793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        i = i+1
2803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        continue
2813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    k = self.parse_starttag(i)
2823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if k < 0: break
2833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.__seen_starttag = 1
2843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.lineno = self.lineno + rawdata[i:k].count('\n')
2853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = k
2863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
2873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if endtagopen.match(rawdata, i):
2883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    k = self.parse_endtag(i)
2893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if k < 0: break
2903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.lineno = self.lineno + rawdata[i:k].count('\n')
2913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i =  k
2923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
2933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if commentopen.match(rawdata, i):
2943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if self.literal:
2953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        data = rawdata[i]
2963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.handle_data(data)
2973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.lineno = self.lineno + data.count('\n')
2983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        i = i+1
2993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        continue
3003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    k = self.parse_comment(i)
3013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if k < 0: break
3023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.lineno = self.lineno + rawdata[i:k].count('\n')
3033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = k
3043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if cdataopen.match(rawdata, i):
3063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    k = self.parse_cdata(i)
3073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if k < 0: break
3083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.lineno = self.lineno + rawdata[i:k].count('\n')
3093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = k
3103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                res = xmldecl.match(rawdata, i)
3123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if res:
3133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if not self.__at_start:
3143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.syntax_error("<?xml?> declaration not at start of document")
3153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    version, encoding, standalone = res.group('version',
3163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                                                              'encoding',
3173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                                                              'standalone')
3183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if version[1:-1] != '1.0':
3193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        raise Error('only XML version 1.0 supported')
3203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if encoding: encoding = encoding[1:-1]
3213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if standalone: standalone = standalone[1:-1]
3223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.handle_xml(encoding, standalone)
3233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = res.end(0)
3243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                res = procopen.match(rawdata, i)
3263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if res:
3273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    k = self.parse_proc(i)
3283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if k < 0: break
3293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.lineno = self.lineno + rawdata[i:k].count('\n')
3303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = k
3313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                res = doctype.match(rawdata, i)
3333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if res:
3343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if self.literal:
3353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        data = rawdata[i]
3363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.handle_data(data)
3373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.lineno = self.lineno + data.count('\n')
3383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        i = i+1
3393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        continue
3403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if self.__seen_doctype:
3413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.syntax_error('multiple DOCTYPE elements')
3423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if self.__seen_starttag:
3433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.syntax_error('DOCTYPE not at beginning of document')
3443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    k = self.parse_doctype(res)
3453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if k < 0: break
3463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.__seen_doctype = res.group('name')
3473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if self.__map_case:
3483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.__seen_doctype = self.__seen_doctype.lower()
3493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.lineno = self.lineno + rawdata[i:k].count('\n')
3503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = k
3513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            elif rawdata[i] == '&':
3533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if self.literal:
3543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    data = rawdata[i]
3553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.handle_data(data)
3563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = i+1
3573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                res = charref.match(rawdata, i)
3593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if res is not None:
3603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = res.end(0)
3613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if rawdata[i-1] != ';':
3623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.syntax_error("`;' missing in charref")
3633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        i = i-1
3643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if not self.stack:
3653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.syntax_error('data not in content')
3663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.handle_charref(res.group('char')[:-1])
3673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.lineno = self.lineno + res.group(0).count('\n')
3683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                res = entityref.match(rawdata, i)
3703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if res is not None:
3713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = res.end(0)
3723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if rawdata[i-1] != ';':
3733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.syntax_error("`;' missing in entityref")
3743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        i = i-1
3753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    name = res.group('name')
3763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if self.__map_case:
3773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        name = name.lower()
3783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if name in self.entitydefs:
3793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
3803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        n = len(rawdata)
3813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        i = res.start(0)
3823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    else:
3833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.unknown_entityref(name)
3843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.lineno = self.lineno + res.group(0).count('\n')
3853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            elif rawdata[i] == ']':
3873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if self.literal:
3883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    data = rawdata[i]
3893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.handle_data(data)
3903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    i = i+1
3913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if n-i < 3:
3933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    break
3943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if cdataclose.match(rawdata, i):
3953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.syntax_error("bogus `]]>'")
3963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.handle_data(rawdata[i])
3973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                i = i+1
3983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                continue
3993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            else:
4003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                raise Error('neither < nor & ??')
4013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # We get here only if incomplete matches but
4023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # nothing else
4033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            break
4043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # end while
4053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if i > 0:
4063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.__at_start = 0
4073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if end and i < n:
4083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            data = rawdata[i]
4093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error("bogus `%s'" % data)
4103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if not self.__accept_utf8 and illegal.search(data):
4113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('illegal character in content')
4123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.handle_data(data)
4133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.lineno = self.lineno + data.count('\n')
4143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.rawdata = rawdata[i+1:]
4153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return self.goahead(end)
4163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.rawdata = rawdata[i:]
4173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if end:
4183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if not self.__seen_starttag:
4193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('no elements in file')
4203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if self.stack:
4213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('missing end tags')
4223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                while self.stack:
4233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.finish_endtag(self.stack[-1][0])
4243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
    # Internal -- parse comment, return index just past it or -1 if not terminated
4263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def parse_comment(self, i):
4273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        rawdata = self.rawdata
4283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if rawdata[i:i+4] != '<!--':
4293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            raise Error('unexpected call to handle_comment')
4303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        res = commentclose.search(rawdata, i+4)
4313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if res is None:
4323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return -1
4333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if doubledash.search(rawdata, i+4, res.start(0)):
4343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error("`--' inside comment")
4353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if rawdata[res.start(0)-1] == '-':
4363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('comment cannot end in three dashes')
4373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not self.__accept_utf8 and \
4383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel           illegal.search(rawdata, i+4, res.start(0)):
4393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('illegal character in comment')
4403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.handle_comment(rawdata[i+4: res.start(0)])
4413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return res.end(0)
4423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
    # Internal -- handle DOCTYPE tag, return index just past it or -1 if not terminated
4443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def parse_doctype(self, res):
4453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        rawdata = self.rawdata
4463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        n = len(rawdata)
4473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        name = res.group('name')
4483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if self.__map_case:
4493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            name = name.lower()
4503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        pubid, syslit = res.group('pubid', 'syslit')
4513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if pubid is not None:
4523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            pubid = pubid[1:-1]         # remove quotes
4533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            pubid = ' '.join(pubid.split()) # normalize
4543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if syslit is not None: syslit = syslit[1:-1] # remove quotes
4553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        j = k = res.end(0)
4563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if k >= n:
4573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return -1
4583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if rawdata[k] == '[':
4593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            level = 0
4603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            k = k+1
4613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            dq = sq = 0
4623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            while k < n:
4633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                c = rawdata[k]
4643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if not sq and c == '"':
4653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    dq = not dq
4663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                elif not dq and c == "'":
4673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    sq = not sq
4683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                elif sq or dq:
4693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    pass
4703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                elif level <= 0 and c == ']':
4713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    res = endbracket.match(rawdata, k+1)
4723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if res is None:
4733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        return -1
4743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
4753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    return res.end(0)
4763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                elif c == '<':
4773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    level = level + 1
4783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                elif c == '>':
4793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    level = level - 1
4803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if level < 0:
4813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        self.syntax_error("bogus `>' in DOCTYPE")
4823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                k = k+1
4833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        res = endbracketfind.match(rawdata, k)
4843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if res is None:
4853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return -1
4863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if endbracket.match(rawdata, k) is None:
4873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('garbage in DOCTYPE')
4883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.handle_doctype(name, pubid, syslit, None)
4893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return res.end(0)
4903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
    # Internal -- handle CDATA tag, return index just past it or -1 if not terminated
4923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def parse_cdata(self, i):
4933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        rawdata = self.rawdata
4943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if rawdata[i:i+9] != '<![CDATA[':
4953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            raise Error('unexpected call to parse_cdata')
4963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        res = cdataclose.search(rawdata, i+9)
4973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if res is None:
4983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return -1
4993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not self.__accept_utf8 and \
5003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel           illegal.search(rawdata, i+9, res.start(0)):
5013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('illegal character in CDATA')
5023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not self.stack:
5033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('CDATA not in content')
5043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.handle_cdata(rawdata[i+9:res.start(0)])
5053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return res.end(0)
5063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
    # Attribute names allowed in an `xml:namespace' processing instruction;
    # parse_proc() only tests whether an attribute name is a key here.
    __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
5083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Internal -- handle a processing instruction tag
5093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def parse_proc(self, i):
5103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        rawdata = self.rawdata
5113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        end = procclose.search(rawdata, i)
5123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if end is None:
5133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return -1
5143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        j = end.start(0)
5153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
5163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('illegal character in processing instruction')
5173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        res = tagfind.match(rawdata, i+2)
5183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if res is None:
5193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            raise Error('unexpected call to parse_proc')
5203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        k = res.end(0)
5213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        name = res.group(0)
5223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if self.__map_case:
5233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            name = name.lower()
5243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if name == 'xml:namespace':
5253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('old-fashioned namespace declaration')
5263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.__use_namespaces = -1
5273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # namespace declaration
5283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # this must come after the <?xml?> declaration (if any)
5293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # and before the <!DOCTYPE> (if any).
5303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if self.__seen_doctype or self.__seen_starttag:
5313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('xml:namespace declaration too late in document')
5323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            attrdict, namespace, k = self.parse_attributes(name, k, j)
5333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if namespace:
5343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('namespace declaration inside namespace declaration')
5353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for attrname in attrdict.keys():
5363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if not attrname in self.__xml_namespace_attributes:
5373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
5383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if not 'ns' in attrdict or not 'prefix' in attrdict:
5393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('xml:namespace without required attributes')
5403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            prefix = attrdict.get('prefix')
5413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if ncname.match(prefix) is None:
5423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('xml:namespace illegal prefix value')
5433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                return end.end(0)
5443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if prefix in self.__namespaces:
5453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('xml:namespace prefix not unique')
5463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.__namespaces[prefix] = attrdict['ns']
5473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        else:
5483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if name.lower() == 'xml':
5493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('illegal processing instruction target name')
5503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.handle_proc(name, rawdata[k:j])
5513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return end.end(0)
5523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
5533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Internal -- parse attributes between i and j
5543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def parse_attributes(self, tag, i, j):
5553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        rawdata = self.rawdata
5563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        attrdict = {}
5573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        namespace = {}
5583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        while i < j:
5593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            res = attrfind.match(rawdata, i)
5603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if res is None:
5613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                break
5623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            attrname, attrvalue = res.group('name', 'value')
5633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if self.__map_case:
5643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                attrname = attrname.lower()
5653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            i = res.end(0)
5663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if attrvalue is None:
5673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error("no value specified for attribute `%s'" % attrname)
5683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                attrvalue = attrname
5693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
5703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                 attrvalue[:1] == '"' == attrvalue[-1:]:
5713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                attrvalue = attrvalue[1:-1]
5723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            elif not self.__accept_unquoted_attributes:
5733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error("attribute `%s' value not quoted" % attrname)
5743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            res = xmlns.match(attrname)
5753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if res is not None:
5763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                # namespace declaration
5773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                ncname = res.group('ncname')
5783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                namespace[ncname or ''] = attrvalue or None
5793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if not self.__use_namespaces:
5803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.__use_namespaces = len(self.stack)+1
5813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                continue
5823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if '<' in attrvalue:
5833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error("`<' illegal in attribute value")
5843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if attrname in attrdict:
5853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error("attribute `%s' specified twice" % attrname)
5863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            attrvalue = attrvalue.translate(attrtrans)
5873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            attrdict[attrname] = self.translate_references(attrvalue)
5883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return attrdict, namespace, i
5893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
    # Internal -- handle starttag, return index just past it or -1 if not terminated
5913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def parse_starttag(self, i):
5923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        rawdata = self.rawdata
5933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # i points to start of tag
5943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        end = endbracketfind.match(rawdata, i+1)
5953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if end is None:
5963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return -1
5973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        tag = starttagmatch.match(rawdata, i)
5983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if tag is None or tag.end(0) != end.end(0):
5993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('garbage in starttag')
6003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return end.end(0)
6013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        nstag = tagname = tag.group('tagname')
6023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if self.__map_case:
6033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            nstag = tagname = nstag.lower()
6043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not self.__seen_starttag and self.__seen_doctype and \
6053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel           tagname != self.__seen_doctype:
6063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('starttag does not match DOCTYPE')
6073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if self.__seen_starttag and not self.stack:
6083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('multiple elements on top level')
6093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        k, j = tag.span('attrs')
6103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
6113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.stack.append((tagname, nsdict, nstag))
6123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if self.__use_namespaces:
6133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            res = qname.match(tagname)
6143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        else:
6153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            res = None
6163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if res is not None:
6173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            prefix, nstag = res.group('prefix', 'local')
6183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if prefix is None:
6193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                prefix = ''
6203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            ns = None
6213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for t, d, nst in self.stack:
6223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if prefix in d:
6233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    ns = d[prefix]
6243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if ns is None and prefix != '':
6253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                ns = self.__namespaces.get(prefix)
6263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if ns is not None:
6273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                nstag = ns + ' ' + nstag
6283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            elif prefix != '':
6293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                nstag = prefix + ':' + nstag # undo split
6303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.stack[-1] = tagname, nsdict, nstag
6313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # translate namespace of attributes
6323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        attrnamemap = {} # map from new name to old name (used for error reporting)
6333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        for key in attrdict.keys():
6343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            attrnamemap[key] = key
6353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if self.__use_namespaces:
6363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            nattrdict = {}
6373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for key, val in attrdict.items():
6383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                okey = key
6393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                res = qname.match(key)
6403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if res is not None:
6413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    aprefix, key = res.group('prefix', 'local')
6423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if self.__map_case:
6433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        key = key.lower()
6443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if aprefix is not None:
6453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        ans = None
6463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        for t, d, nst in self.stack:
6473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                            if aprefix in d:
6483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                                ans = d[aprefix]
6493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        if ans is None:
6503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                            ans = self.__namespaces.get(aprefix)
6513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        if ans is not None:
6523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                            key = ans + ' ' + key
6533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        else:
6543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                            key = aprefix + ':' + key
6553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                nattrdict[key] = val
6563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                attrnamemap[key] = okey
6573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            attrdict = nattrdict
6583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        attributes = self.attributes.get(nstag)
6593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if attributes is not None:
6603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for key in attrdict.keys():
6613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if not key in attributes:
6623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
6633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for key, val in attributes.items():
6643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if val is not None and not key in attrdict:
6653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    attrdict[key] = val
6663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        method = self.elements.get(nstag, (None, None))[0]
6673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.finish_starttag(nstag, attrdict, method)
6683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if tag.group('slash') == '/':
6693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.finish_endtag(tagname)
6703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return tag.end(0)
6713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
6723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Internal -- parse endtag
6733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def parse_endtag(self, i):
6743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        rawdata = self.rawdata
6753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        end = endbracketfind.match(rawdata, i+1)
6763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if end is None:
6773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return -1
6783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        res = tagfind.match(rawdata, i+2)
6793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if res is None:
6803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if self.literal:
6813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.handle_data(rawdata[i])
6823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                return i+1
6833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if not self.__accept_missing_endtag_name:
6843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('no name specified in end tag')
6853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            tag = self.stack[-1][0]
6863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            k = i+2
6873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        else:
6883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            tag = res.group(0)
6893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if self.__map_case:
6903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                tag = tag.lower()
6913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if self.literal:
6923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if not self.stack or tag != self.stack[-1][0]:
6933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    self.handle_data(rawdata[i])
6943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    return i+1
6953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            k = res.end(0)
6963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if endbracket.match(rawdata, k) is None:
6973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('garbage in end tag')
6983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.finish_endtag(tag)
6993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return end.end(0)
7003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Internal -- finish processing of start tag
7023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def finish_starttag(self, tagname, attrdict, method):
7033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if method is not None:
7043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.handle_starttag(tagname, method, attrdict)
7053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        else:
7063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.unknown_starttag(tagname, attrdict)
7073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Internal -- finish processing of end tag
7093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def finish_endtag(self, tag):
7103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.literal = 0
7113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not tag:
7123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.syntax_error('name-less end tag')
7133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            found = len(self.stack) - 1
7143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if found < 0:
7153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.unknown_endtag(tag)
7163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                return
7173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        else:
7183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            found = -1
7193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for i in range(len(self.stack)):
7203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if tag == self.stack[i][0]:
7213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    found = i
7223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if found == -1:
7233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('unopened end tag')
7243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                return
7253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        while len(self.stack) > found:
7263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if found < len(self.stack) - 1:
7273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.syntax_error('missing close tag for %s' % self.stack[-1][2])
7283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            nstag = self.stack[-1][2]
7293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            method = self.elements.get(nstag, (None, None))[1]
7303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if method is not None:
7313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.handle_endtag(nstag, method)
7323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            else:
7333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.unknown_endtag(nstag)
7343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if self.__use_namespaces == len(self.stack):
7353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self.__use_namespaces = 0
7363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            del self.stack[-1]
7373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Overridable -- handle xml processing instruction
7393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_xml(self, encoding, standalone):
7403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        pass
7413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Overridable -- handle DOCTYPE
7433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_doctype(self, tag, pubid, syslit, data):
7443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        pass
7453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Overridable -- handle start tag
7473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_starttag(self, tag, method, attrs):
7483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        method(attrs)
7493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Overridable -- handle end tag
7513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_endtag(self, tag, method):
7523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        method()
7533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Example -- handle character reference, no need to override
7553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_charref(self, name):
7563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        try:
7573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if name[0] == 'x':
7583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                n = int(name[1:], 16)
7593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            else:
7603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                n = int(name)
7613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        except ValueError:
7623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.unknown_charref(name)
7633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return
7643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not 0 <= n <= 255:
7653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.unknown_charref(name)
7663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return
7673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.handle_data(chr(n))
7683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Definition of entities -- derived classes may override
7703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    entitydefs = {'lt': '&#60;',        # must use charref
7713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                  'gt': '&#62;',
7723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                  'amp': '&#38;',       # must use charref
7733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                  'quot': '&#34;',
7743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                  'apos': '&#39;',
7753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                  }
7763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Example -- handle data, should be overridden
7783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_data(self, data):
7793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        pass
7803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Example -- handle cdata, could be overridden
7823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_cdata(self, data):
7833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        pass
7843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Example -- handle comment, could be overridden
7863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_comment(self, data):
7873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        pass
7883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Example -- handle processing instructions, could be overridden
7903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_proc(self, name, data):
7913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        pass
7923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Example -- handle relatively harmless syntax errors, could be overridden
7943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def syntax_error(self, message):
7953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        raise Error('Syntax error at line %d: %s' % (self.lineno, message))
7963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
7973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # To be overridden -- handlers for unknown objects
7983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def unknown_starttag(self, tag, attrs): pass
7993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def unknown_endtag(self, tag): pass
8003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def unknown_charref(self, ref): pass
8013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def unknown_entityref(self, name):
8023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.syntax_error("reference to unknown entity `&%s;'" % name)
8033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielclass TestXMLParser(XMLParser):
8063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def __init__(self, **kw):
8083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.testdata = ""
8093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        XMLParser.__init__(self, **kw)
8103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_xml(self, encoding, standalone):
8123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print 'xml: encoding =',encoding,'standalone =',standalone
8143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_doctype(self, tag, pubid, syslit, data):
8163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print 'DOCTYPE:',tag, repr(data)
8183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_data(self, data):
8203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.testdata = self.testdata + data
8213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if len(repr(self.testdata)) >= 70:
8223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.flush()
8233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def flush(self):
8253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        data = self.testdata
8263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if data:
8273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.testdata = ""
8283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            print 'data:', repr(data)
8293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_cdata(self, data):
8313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print 'cdata:', repr(data)
8333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_proc(self, name, data):
8353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print 'processing:',name,repr(data)
8373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def handle_comment(self, data):
8393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        r = repr(data)
8413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if len(r) > 68:
8423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            r = r[:32] + '...' + r[-32:]
8433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print 'comment:', r
8443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def syntax_error(self, message):
8463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print 'error at line %d:' % self.lineno, message
8473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def unknown_starttag(self, tag, attrs):
8493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not attrs:
8513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            print 'start tag: <' + tag + '>'
8523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        else:
8533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            print 'start tag: <' + tag,
8543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for name, value in attrs.items():
8553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                print name + '=' + '"' + value + '"',
8563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            print '>'
8573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def unknown_endtag(self, tag):
8593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print 'end tag: </' + tag + '>'
8613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def unknown_entityref(self, ref):
8633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print '*** unknown entity ref: &' + ref + ';'
8653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def unknown_charref(self, ref):
8673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print '*** unknown char ref: &#' + ref + ';'
8693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def close(self):
8713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        XMLParser.close(self)
8723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.flush()
8733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanieldef test(args = None):
8753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    import sys, getopt
8763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    from time import time
8773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    if not args:
8793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        args = sys.argv[1:]
8803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    opts, args = getopt.getopt(args, 'st')
8823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    klass = TestXMLParser
8833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    do_time = 0
8843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    for o, a in opts:
8853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if o == '-s':
8863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            klass = XMLParser
8873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        elif o == '-t':
8883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            do_time = 1
8893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    if args:
8913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        file = args[0]
8923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    else:
8933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        file = 'test.xml'
8943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
8953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    if file == '-':
8963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        f = sys.stdin
8973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    else:
8983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        try:
8993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            f = open(file, 'r')
9003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        except IOError, msg:
9013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            print file, ":", msg
9023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            sys.exit(1)
9033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
9043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    data = f.read()
9053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    if f is not sys.stdin:
9063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        f.close()
9073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
9083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    x = klass()
9093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    t0 = time()
9103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    try:
9113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if do_time:
9123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            x.feed(data)
9133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            x.close()
9143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        else:
9153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for c in data:
9163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                x.feed(c)
9173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            x.close()
9183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    except Error, msg:
9193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        t1 = time()
9203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print msg
9213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if do_time:
9223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            print 'total time: %g' % (t1-t0)
9233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        sys.exit(1)
9243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    t1 = time()
9253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    if do_time:
9263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        print 'total time: %g' % (t1-t0)
9273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
9283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
# Run the command-line driver when this file is executed as a script.
if __name__ == '__main__':
    test()
931