1"""A parser for XML, using the derived class as static DTD."""
2
3# Author: Sjoerd Mullender.
4
5import re
6import string
7
8import warnings
9warnings.warn("The xmllib module is obsolete.  Use xml.sax instead.",
10              DeprecationWarning, 2)
11del warnings
12
13version = '0.3'
14
class Error(RuntimeError):
    """Exception raised by the parser (by default also for syntax errors)."""
17
18# Regular expressions used for parsing
19
# Pattern fragments (plain strings) that the compiled patterns below are
# assembled from.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
# the characters at which goahead() stops collecting plain character data
interesting = re.compile('[]&<]')

amp = re.compile('&')
# The three reference patterns also consume the single character that
# follows the reference (normally the ';'); callers look at that last
# matched character to detect a missing ';'.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
# with .match(): the string consists of white space only (and is not empty)
space = re.compile(_S + '$')
newline = re.compile('\n')

# one attribute: leading white space, a name and an optional "= value"
# part whose value is either a quoted string or a run of unquoted characters
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
# end of a start tag; the 'slash' group is '/' for an empty-element tag
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# a complete start tag: name, any number of attributes, closing bracket
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                      starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# everything up to and including the first '>' that is not inside a
# quoted string
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# (the two literal templates take the regex group name through '%s')
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
# the start of a DOCTYPE declaration: name plus optional external id; the
# rest of the declaration is scanned by XMLParser.parse_doctype()
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# the complete <?xml ...?> declaration; the version, encoding and
# standalone groups still include their surrounding quotes
xmldecl = re.compile('<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
# start of a processing instruction; the 'proc' group is the target name
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# translation table that turns CR, LF and TAB into a space; applied to
# attribute values by XMLParser.parse_attributes()
attrtrans = string.maketrans(' \r\n\t', '    ')
74
75# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
# with .match(): the whole string is a single NCName
ncname = re.compile(_NCName + '$')
# a qualified name; the 'prefix' group is None when there is no "prefix:"
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

# an attribute name that declares a namespace: "xmlns" (the 'ncname' group
# is None) or "xmlns:prefix" (the 'ncname' group is the prefix)
xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
82
83# XML parser base class -- find tags and call handler functions.
84# Usage: p = XMLParser(); p.feed(data); ...; p.close().
85# The dtd is defined by deriving a class which defines methods with
86# special names to handle tags: start_foo and end_foo to handle <foo>
87# and </foo>, respectively.  The data between tags is passed to the
88# parser by calling self.handle_data() with some data as argument (the
89# data may be split up in arbitrary chunks).
90
91class XMLParser:
    # Maps an element name to a dictionary of the attributes known for
    # that element and their default values (None meaning no default).
    # parse_starttag() reports attributes that are not in the dictionary
    # and fills in the defaults of the ones that were not given.
    attributes = {}                     # default, to be overridden
    # Maps an element name to a (start handler, end handler) pair.  When a
    # subclass leaves this untouched, reset() builds an instance-level
    # table from the start_*/end_* methods.
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    # true: unquoted attribute values are not reported as syntax errors
    __accept_unquoted_attributes = 0
    # true: an end tag without a name is not reported as a syntax error
    __accept_missing_endtag_name = 0
    # true: tag, attribute, entity and doctype names are lower-cased
    __map_case = 0
    # true: the "illegal character" checks are skipped
    __accept_utf8 = 0
    # false: translate_references() returns its argument unchanged
    __translate_attribute_references = 1
101
102    # Interface -- initialize and reset this instance
103    def __init__(self, **kw):
104        self.__fixed = 0
105        if 'accept_unquoted_attributes' in kw:
106            self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
107        if 'accept_missing_endtag_name' in kw:
108            self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
109        if 'map_case' in kw:
110            self.__map_case = kw['map_case']
111        if 'accept_utf8' in kw:
112            self.__accept_utf8 = kw['accept_utf8']
113        if 'translate_attribute_references' in kw:
114            self.__translate_attribute_references = kw['translate_attribute_references']
115        self.reset()
116
117    def __fixelements(self):
118        self.__fixed = 1
119        self.elements = {}
120        self.__fixdict(self.__dict__)
121        self.__fixclass(self.__class__)
122
123    def __fixclass(self, kl):
124        self.__fixdict(kl.__dict__)
125        for k in kl.__bases__:
126            self.__fixclass(k)
127
128    def __fixdict(self, dict):
129        for key in dict.keys():
130            if key[:6] == 'start_':
131                tag = key[6:]
132                start, end = self.elements.get(tag, (None, None))
133                if start is None:
134                    self.elements[tag] = getattr(self, key), end
135            elif key[:4] == 'end_':
136                tag = key[4:]
137                start, end = self.elements.get(tag, (None, None))
138                if end is None:
139                    self.elements[tag] = start, getattr(self, key)
140
141    # Interface -- reset this instance.  Loses all unprocessed data
142    def reset(self):
143        self.rawdata = ''
144        self.stack = []
145        self.nomoretags = 0
146        self.literal = 0
147        self.lineno = 1
148        self.__at_start = 1
149        self.__seen_doctype = None
150        self.__seen_starttag = 0
151        self.__use_namespaces = 0
152        self.__namespaces = {'xml':None}   # xml is implicitly declared
153        # backward compatibility hack: if elements not overridden,
154        # fill it in ourselves
155        if self.elements is XMLParser.elements:
156            self.__fixelements()
157
158    # For derived classes only -- enter literal mode (CDATA) till EOF
159    def setnomoretags(self):
160        self.nomoretags = self.literal = 1
161
162    # For derived classes only -- enter literal mode (CDATA)
163    def setliteral(self, *args):
164        self.literal = 1
165
166    # Interface -- feed some data to the parser.  Call this as
167    # often as you want, with as little or as much text as you
168    # want (may include '\n').  (This just saves the text, all the
169    # processing is done by goahead().)
170    def feed(self, data):
171        self.rawdata = self.rawdata + data
172        self.goahead(0)
173
174    # Interface -- handle the remaining data
175    def close(self):
176        self.goahead(1)
177        if self.__fixed:
178            self.__fixed = 0
179            # remove self.elements so that we don't leak
180            del self.elements
181
182    # Interface -- translate references
183    def translate_references(self, data, all = 1):
184        if not self.__translate_attribute_references:
185            return data
186        i = 0
187        while 1:
188            res = amp.search(data, i)
189            if res is None:
190                return data
191            s = res.start(0)
192            res = ref.match(data, s)
193            if res is None:
194                self.syntax_error("bogus `&'")
195                i = s+1
196                continue
197            i = res.end(0)
198            str = res.group(1)
199            rescan = 0
200            if str[0] == '#':
201                if str[1] == 'x':
202                    str = chr(int(str[2:], 16))
203                else:
204                    str = chr(int(str[1:]))
205                if data[i - 1] != ';':
206                    self.syntax_error("`;' missing after char reference")
207                    i = i-1
208            elif all:
209                if str in self.entitydefs:
210                    str = self.entitydefs[str]
211                    rescan = 1
212                elif data[i - 1] != ';':
213                    self.syntax_error("bogus `&'")
214                    i = s + 1 # just past the &
215                    continue
216                else:
217                    self.syntax_error("reference to unknown entity `&%s;'" % str)
218                    str = '&' + str + ';'
219            elif data[i - 1] != ';':
220                self.syntax_error("bogus `&'")
221                i = s + 1 # just past the &
222                continue
223
224            # when we get here, str contains the translated text and i points
225            # to the end of the string that is to be replaced
226            data = data[:s] + str + data[i:]
227            if rescan:
228                i = s
229            else:
230                i = s + len(str)
231
232    # Interface - return a dictionary of all namespaces currently valid
233    def getnamespace(self):
234        nsdict = {}
235        for t, d, nst in self.stack:
236            nsdict.update(d)
237        return nsdict
238
239    # Internal -- handle data as far as reasonable.  May leave state
240    # and data to be processed by a subsequent call.  If 'end' is
241    # true, force handling all data as if followed by EOF marker.
242    def goahead(self, end):
243        rawdata = self.rawdata
244        i = 0
245        n = len(rawdata)
246        while i < n:
247            if i > 0:
248                self.__at_start = 0
249            if self.nomoretags:
250                data = rawdata[i:n]
251                self.handle_data(data)
252                self.lineno = self.lineno + data.count('\n')
253                i = n
254                break
255            res = interesting.search(rawdata, i)
256            if res:
257                j = res.start(0)
258            else:
259                j = n
260            if i < j:
261                data = rawdata[i:j]
262                if self.__at_start and space.match(data) is None:
263                    self.syntax_error('illegal data at start of file')
264                self.__at_start = 0
265                if not self.stack and space.match(data) is None:
266                    self.syntax_error('data not in content')
267                if not self.__accept_utf8 and illegal.search(data):
268                    self.syntax_error('illegal character in content')
269                self.handle_data(data)
270                self.lineno = self.lineno + data.count('\n')
271            i = j
272            if i == n: break
273            if rawdata[i] == '<':
274                if starttagopen.match(rawdata, i):
275                    if self.literal:
276                        data = rawdata[i]
277                        self.handle_data(data)
278                        self.lineno = self.lineno + data.count('\n')
279                        i = i+1
280                        continue
281                    k = self.parse_starttag(i)
282                    if k < 0: break
283                    self.__seen_starttag = 1
284                    self.lineno = self.lineno + rawdata[i:k].count('\n')
285                    i = k
286                    continue
287                if endtagopen.match(rawdata, i):
288                    k = self.parse_endtag(i)
289                    if k < 0: break
290                    self.lineno = self.lineno + rawdata[i:k].count('\n')
291                    i =  k
292                    continue
293                if commentopen.match(rawdata, i):
294                    if self.literal:
295                        data = rawdata[i]
296                        self.handle_data(data)
297                        self.lineno = self.lineno + data.count('\n')
298                        i = i+1
299                        continue
300                    k = self.parse_comment(i)
301                    if k < 0: break
302                    self.lineno = self.lineno + rawdata[i:k].count('\n')
303                    i = k
304                    continue
305                if cdataopen.match(rawdata, i):
306                    k = self.parse_cdata(i)
307                    if k < 0: break
308                    self.lineno = self.lineno + rawdata[i:k].count('\n')
309                    i = k
310                    continue
311                res = xmldecl.match(rawdata, i)
312                if res:
313                    if not self.__at_start:
314                        self.syntax_error("<?xml?> declaration not at start of document")
315                    version, encoding, standalone = res.group('version',
316                                                              'encoding',
317                                                              'standalone')
318                    if version[1:-1] != '1.0':
319                        raise Error('only XML version 1.0 supported')
320                    if encoding: encoding = encoding[1:-1]
321                    if standalone: standalone = standalone[1:-1]
322                    self.handle_xml(encoding, standalone)
323                    i = res.end(0)
324                    continue
325                res = procopen.match(rawdata, i)
326                if res:
327                    k = self.parse_proc(i)
328                    if k < 0: break
329                    self.lineno = self.lineno + rawdata[i:k].count('\n')
330                    i = k
331                    continue
332                res = doctype.match(rawdata, i)
333                if res:
334                    if self.literal:
335                        data = rawdata[i]
336                        self.handle_data(data)
337                        self.lineno = self.lineno + data.count('\n')
338                        i = i+1
339                        continue
340                    if self.__seen_doctype:
341                        self.syntax_error('multiple DOCTYPE elements')
342                    if self.__seen_starttag:
343                        self.syntax_error('DOCTYPE not at beginning of document')
344                    k = self.parse_doctype(res)
345                    if k < 0: break
346                    self.__seen_doctype = res.group('name')
347                    if self.__map_case:
348                        self.__seen_doctype = self.__seen_doctype.lower()
349                    self.lineno = self.lineno + rawdata[i:k].count('\n')
350                    i = k
351                    continue
352            elif rawdata[i] == '&':
353                if self.literal:
354                    data = rawdata[i]
355                    self.handle_data(data)
356                    i = i+1
357                    continue
358                res = charref.match(rawdata, i)
359                if res is not None:
360                    i = res.end(0)
361                    if rawdata[i-1] != ';':
362                        self.syntax_error("`;' missing in charref")
363                        i = i-1
364                    if not self.stack:
365                        self.syntax_error('data not in content')
366                    self.handle_charref(res.group('char')[:-1])
367                    self.lineno = self.lineno + res.group(0).count('\n')
368                    continue
369                res = entityref.match(rawdata, i)
370                if res is not None:
371                    i = res.end(0)
372                    if rawdata[i-1] != ';':
373                        self.syntax_error("`;' missing in entityref")
374                        i = i-1
375                    name = res.group('name')
376                    if self.__map_case:
377                        name = name.lower()
378                    if name in self.entitydefs:
379                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
380                        n = len(rawdata)
381                        i = res.start(0)
382                    else:
383                        self.unknown_entityref(name)
384                    self.lineno = self.lineno + res.group(0).count('\n')
385                    continue
386            elif rawdata[i] == ']':
387                if self.literal:
388                    data = rawdata[i]
389                    self.handle_data(data)
390                    i = i+1
391                    continue
392                if n-i < 3:
393                    break
394                if cdataclose.match(rawdata, i):
395                    self.syntax_error("bogus `]]>'")
396                self.handle_data(rawdata[i])
397                i = i+1
398                continue
399            else:
400                raise Error('neither < nor & ??')
401            # We get here only if incomplete matches but
402            # nothing else
403            break
404        # end while
405        if i > 0:
406            self.__at_start = 0
407        if end and i < n:
408            data = rawdata[i]
409            self.syntax_error("bogus `%s'" % data)
410            if not self.__accept_utf8 and illegal.search(data):
411                self.syntax_error('illegal character in content')
412            self.handle_data(data)
413            self.lineno = self.lineno + data.count('\n')
414            self.rawdata = rawdata[i+1:]
415            return self.goahead(end)
416        self.rawdata = rawdata[i:]
417        if end:
418            if not self.__seen_starttag:
419                self.syntax_error('no elements in file')
420            if self.stack:
421                self.syntax_error('missing end tags')
422                while self.stack:
423                    self.finish_endtag(self.stack[-1][0])
424
    # Internal -- parse comment, return index just past it or -1 if not terminated
426    def parse_comment(self, i):
427        rawdata = self.rawdata
428        if rawdata[i:i+4] != '<!--':
429            raise Error('unexpected call to handle_comment')
430        res = commentclose.search(rawdata, i+4)
431        if res is None:
432            return -1
433        if doubledash.search(rawdata, i+4, res.start(0)):
434            self.syntax_error("`--' inside comment")
435        if rawdata[res.start(0)-1] == '-':
436            self.syntax_error('comment cannot end in three dashes')
437        if not self.__accept_utf8 and \
438           illegal.search(rawdata, i+4, res.start(0)):
439            self.syntax_error('illegal character in comment')
440        self.handle_comment(rawdata[i+4: res.start(0)])
441        return res.end(0)
442
    # Internal -- handle DOCTYPE tag, return index just past it or -1 if not terminated
    def parse_doctype(self, res):
        """Finish parsing a DOCTYPE declaration.

        res is the match of the module-level doctype pattern, which
        covers '<!DOCTYPE', the name and the optional external id.  The
        remainder of the declaration is scanned here and
        handle_doctype(name, pubid, syslit, data) is called, where data
        is the text of the internal subset or None if there is none.

        Returns the index just past the declaration, or -1 if it is not
        complete in the data buffered so far.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        if self.__map_case:
            name = name.lower()
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            pubid = pubid[1:-1]         # remove quotes
            pubid = ' '.join(pubid.split()) # normalize
        if syslit is not None: syslit = syslit[1:-1] # remove quotes
        # j stays at the end of the matched part, k scans ahead
        j = k = res.end(0)
        if k >= n:
            return -1
        if rawdata[k] == '[':
            # internal subset: look for the closing ']' that is neither
            # inside a quoted string nor inside a nested <...> declaration
            level = 0           # nesting depth of '<' ... '>'
            k = k+1
            dq = sq = 0         # inside a double / single quoted string
            while k < n:
                c = rawdata[k]
                if not sq and c == '"':
                    dq = not dq
                elif not dq and c == "'":
                    sq = not sq
                elif sq or dq:
                    pass
                elif level <= 0 and c == ']':
                    res = endbracket.match(rawdata, k+1)
                    if res is None:
                        return -1
                    # rawdata[j] is the '[', rawdata[k] the matching ']'
                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                    return res.end(0)
                elif c == '<':
                    level = level + 1
                elif c == '>':
                    level = level - 1
                    if level < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                k = k+1
        # no internal subset (or its end has not been seen yet): find the
        # closing '>' and complain if anything but white space precedes it
        res = endbracketfind.match(rawdata, k)
        if res is None:
            return -1
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return res.end(0)
490
    # Internal -- handle CDATA tag, return index just past it or -1 if not terminated
492    def parse_cdata(self, i):
493        rawdata = self.rawdata
494        if rawdata[i:i+9] != '<![CDATA[':
495            raise Error('unexpected call to parse_cdata')
496        res = cdataclose.search(rawdata, i+9)
497        if res is None:
498            return -1
499        if not self.__accept_utf8 and \
500           illegal.search(rawdata, i+9, res.start(0)):
501            self.syntax_error('illegal character in CDATA')
502        if not self.stack:
503            self.syntax_error('CDATA not in content')
504        self.handle_cdata(rawdata[i+9:res.start(0)])
505        return res.end(0)
506
507    __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
508    # Internal -- handle a processing instruction tag
509    def parse_proc(self, i):
510        rawdata = self.rawdata
511        end = procclose.search(rawdata, i)
512        if end is None:
513            return -1
514        j = end.start(0)
515        if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
516            self.syntax_error('illegal character in processing instruction')
517        res = tagfind.match(rawdata, i+2)
518        if res is None:
519            raise Error('unexpected call to parse_proc')
520        k = res.end(0)
521        name = res.group(0)
522        if self.__map_case:
523            name = name.lower()
524        if name == 'xml:namespace':
525            self.syntax_error('old-fashioned namespace declaration')
526            self.__use_namespaces = -1
527            # namespace declaration
528            # this must come after the <?xml?> declaration (if any)
529            # and before the <!DOCTYPE> (if any).
530            if self.__seen_doctype or self.__seen_starttag:
531                self.syntax_error('xml:namespace declaration too late in document')
532            attrdict, namespace, k = self.parse_attributes(name, k, j)
533            if namespace:
534                self.syntax_error('namespace declaration inside namespace declaration')
535            for attrname in attrdict.keys():
536                if not attrname in self.__xml_namespace_attributes:
537                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
538            if not 'ns' in attrdict or not 'prefix' in attrdict:
539                self.syntax_error('xml:namespace without required attributes')
540            prefix = attrdict.get('prefix')
541            if ncname.match(prefix) is None:
542                self.syntax_error('xml:namespace illegal prefix value')
543                return end.end(0)
544            if prefix in self.__namespaces:
545                self.syntax_error('xml:namespace prefix not unique')
546            self.__namespaces[prefix] = attrdict['ns']
547        else:
548            if name.lower() == 'xml':
549                self.syntax_error('illegal processing instruction target name')
550            self.handle_proc(name, rawdata[k:j])
551        return end.end(0)
552
553    # Internal -- parse attributes between i and j
554    def parse_attributes(self, tag, i, j):
555        rawdata = self.rawdata
556        attrdict = {}
557        namespace = {}
558        while i < j:
559            res = attrfind.match(rawdata, i)
560            if res is None:
561                break
562            attrname, attrvalue = res.group('name', 'value')
563            if self.__map_case:
564                attrname = attrname.lower()
565            i = res.end(0)
566            if attrvalue is None:
567                self.syntax_error("no value specified for attribute `%s'" % attrname)
568                attrvalue = attrname
569            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
570                 attrvalue[:1] == '"' == attrvalue[-1:]:
571                attrvalue = attrvalue[1:-1]
572            elif not self.__accept_unquoted_attributes:
573                self.syntax_error("attribute `%s' value not quoted" % attrname)
574            res = xmlns.match(attrname)
575            if res is not None:
576                # namespace declaration
577                ncname = res.group('ncname')
578                namespace[ncname or ''] = attrvalue or None
579                if not self.__use_namespaces:
580                    self.__use_namespaces = len(self.stack)+1
581                continue
582            if '<' in attrvalue:
583                self.syntax_error("`<' illegal in attribute value")
584            if attrname in attrdict:
585                self.syntax_error("attribute `%s' specified twice" % attrname)
586            attrvalue = attrvalue.translate(attrtrans)
587            attrdict[attrname] = self.translate_references(attrvalue)
588        return attrdict, namespace, i
589
    # Internal -- handle starttag, return index just past it or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag (or empty-element tag) at rawdata[i].

        The element is pushed on self.stack as a tuple (tagname, namespace
        declarations, namespace-qualified name), its attributes are parsed
        and checked against self.attributes, and the start handler is
        invoked through finish_starttag().  For an empty-element tag the
        end tag is processed immediately afterwards.

        Returns the index just past the closing '>', or -1 if the tag is
        not complete in the data buffered so far.
        """
        rawdata = self.rawdata
        # i points to start of tag
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        # a well-formed tag must end exactly at the '>' found above
        tag = starttagmatch.match(rawdata, i)
        if tag is None or tag.end(0) != end.end(0):
            self.syntax_error('garbage in starttag')
            return end.end(0)
        nstag = tagname = tag.group('tagname')
        if self.__map_case:
            nstag = tagname = nstag.lower()
        if not self.__seen_starttag and self.__seen_doctype and \
           tagname != self.__seen_doctype:
            self.syntax_error('starttag does not match DOCTYPE')
        if self.__seen_starttag and not self.stack:
            self.syntax_error('multiple elements on top level')
        k, j = tag.span('attrs')
        attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
        self.stack.append((tagname, nsdict, nstag))
        if self.__use_namespaces:
            res = qname.match(tagname)
        else:
            res = None
        if res is not None:
            # resolve the prefix of the tag name to a namespace
            prefix, nstag = res.group('prefix', 'local')
            if prefix is None:
                prefix = ''
            ns = None
            # the whole stack is scanned without stopping, so the
            # innermost declaration of the prefix is the one that is kept
            for t, d, nst in self.stack:
                if prefix in d:
                    ns = d[prefix]
            if ns is None and prefix != '':
                ns = self.__namespaces.get(prefix)
            if ns is not None:
                nstag = ns + ' ' + nstag
            elif prefix != '':
                nstag = prefix + ':' + nstag # undo split
            self.stack[-1] = tagname, nsdict, nstag
        # translate namespace of attributes
        attrnamemap = {} # map from new name to old name (used for error reporting)
        for key in attrdict.keys():
            attrnamemap[key] = key
        if self.__use_namespaces:
            nattrdict = {}
            for key, val in attrdict.items():
                okey = key
                res = qname.match(key)
                if res is not None:
                    aprefix, key = res.group('prefix', 'local')
                    if self.__map_case:
                        key = key.lower()
                    # an attribute without a prefix keeps its plain name
                    if aprefix is not None:
                        ans = None
                        for t, d, nst in self.stack:
                            if aprefix in d:
                                ans = d[aprefix]
                        if ans is None:
                            ans = self.__namespaces.get(aprefix)
                        if ans is not None:
                            key = ans + ' ' + key
                        else:
                            key = aprefix + ':' + key
                nattrdict[key] = val
                attrnamemap[key] = okey
            attrdict = nattrdict
        # check against the declared attributes and fill in the defaults
        attributes = self.attributes.get(nstag)
        if attributes is not None:
            for key in attrdict.keys():
                if not key in attributes:
                    self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
            for key, val in attributes.items():
                if val is not None and not key in attrdict:
                    attrdict[key] = val
        method = self.elements.get(nstag, (None, None))[0]
        self.finish_starttag(nstag, attrdict, method)
        if tag.group('slash') == '/':
            # empty-element tag: close the element right away
            self.finish_endtag(tagname)
        return tag.end(0)
671
672    # Internal -- parse endtag
673    def parse_endtag(self, i):
674        rawdata = self.rawdata
675        end = endbracketfind.match(rawdata, i+1)
676        if end is None:
677            return -1
678        res = tagfind.match(rawdata, i+2)
679        if res is None:
680            if self.literal:
681                self.handle_data(rawdata[i])
682                return i+1
683            if not self.__accept_missing_endtag_name:
684                self.syntax_error('no name specified in end tag')
685            tag = self.stack[-1][0]
686            k = i+2
687        else:
688            tag = res.group(0)
689            if self.__map_case:
690                tag = tag.lower()
691            if self.literal:
692                if not self.stack or tag != self.stack[-1][0]:
693                    self.handle_data(rawdata[i])
694                    return i+1
695            k = res.end(0)
696        if endbracket.match(rawdata, k) is None:
697            self.syntax_error('garbage in end tag')
698        self.finish_endtag(tag)
699        return end.end(0)
700
701    # Internal -- finish processing of start tag
702    def finish_starttag(self, tagname, attrdict, method):
703        if method is not None:
704            self.handle_starttag(tagname, method, attrdict)
705        else:
706            self.unknown_starttag(tagname, attrdict)
707
708    # Internal -- finish processing of end tag
709    def finish_endtag(self, tag):
710        self.literal = 0
711        if not tag:
712            self.syntax_error('name-less end tag')
713            found = len(self.stack) - 1
714            if found < 0:
715                self.unknown_endtag(tag)
716                return
717        else:
718            found = -1
719            for i in range(len(self.stack)):
720                if tag == self.stack[i][0]:
721                    found = i
722            if found == -1:
723                self.syntax_error('unopened end tag')
724                return
725        while len(self.stack) > found:
726            if found < len(self.stack) - 1:
727                self.syntax_error('missing close tag for %s' % self.stack[-1][2])
728            nstag = self.stack[-1][2]
729            method = self.elements.get(nstag, (None, None))[1]
730            if method is not None:
731                self.handle_endtag(nstag, method)
732            else:
733                self.unknown_endtag(nstag)
734            if self.__use_namespaces == len(self.stack):
735                self.__use_namespaces = 0
736            del self.stack[-1]
737
738    # Overridable -- handle xml processing instruction
739    def handle_xml(self, encoding, standalone):
740        pass
741
742    # Overridable -- handle DOCTYPE
743    def handle_doctype(self, tag, pubid, syslit, data):
744        pass
745
746    # Overridable -- handle start tag
747    def handle_starttag(self, tag, method, attrs):
748        method(attrs)
749
750    # Overridable -- handle end tag
751    def handle_endtag(self, tag, method):
752        method()
753
754    # Example -- handle character reference, no need to override
755    def handle_charref(self, name):
756        try:
757            if name[0] == 'x':
758                n = int(name[1:], 16)
759            else:
760                n = int(name)
761        except ValueError:
762            self.unknown_charref(name)
763            return
764        if not 0 <= n <= 255:
765            self.unknown_charref(name)
766            return
767        self.handle_data(chr(n))
768
769    # Definition of entities -- derived classes may override
    # Maps an entity name to its replacement text.  Every predefined
    # entity below is spelled as a decimal character reference.
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }
776
777    # Example -- handle data, should be overridden
778    def handle_data(self, data):
779        pass
780
781    # Example -- handle cdata, could be overridden
782    def handle_cdata(self, data):
783        pass
784
785    # Example -- handle comment, could be overridden
786    def handle_comment(self, data):
787        pass
788
789    # Example -- handle processing instructions, could be overridden
790    def handle_proc(self, name, data):
791        pass
792
793    # Example -- handle relatively harmless syntax errors, could be overridden
794    def syntax_error(self, message):
795        raise Error('Syntax error at line %d: %s' % (self.lineno, message))
796
797    # To be overridden -- handlers for unknown objects
798    def unknown_starttag(self, tag, attrs): pass
799    def unknown_endtag(self, tag): pass
800    def unknown_charref(self, ref): pass
801    def unknown_entityref(self, name):
802        self.syntax_error("reference to unknown entity `&%s;'" % name)
803
804
805class TestXMLParser(XMLParser):
806
807    def __init__(self, **kw):
808        self.testdata = ""
809        XMLParser.__init__(self, **kw)
810
811    def handle_xml(self, encoding, standalone):
812        self.flush()
813        print 'xml: encoding =',encoding,'standalone =',standalone
814
815    def handle_doctype(self, tag, pubid, syslit, data):
816        self.flush()
817        print 'DOCTYPE:',tag, repr(data)
818
819    def handle_data(self, data):
820        self.testdata = self.testdata + data
821        if len(repr(self.testdata)) >= 70:
822            self.flush()
823
824    def flush(self):
825        data = self.testdata
826        if data:
827            self.testdata = ""
828            print 'data:', repr(data)
829
830    def handle_cdata(self, data):
831        self.flush()
832        print 'cdata:', repr(data)
833
834    def handle_proc(self, name, data):
835        self.flush()
836        print 'processing:',name,repr(data)
837
838    def handle_comment(self, data):
839        self.flush()
840        r = repr(data)
841        if len(r) > 68:
842            r = r[:32] + '...' + r[-32:]
843        print 'comment:', r
844
845    def syntax_error(self, message):
846        print 'error at line %d:' % self.lineno, message
847
848    def unknown_starttag(self, tag, attrs):
849        self.flush()
850        if not attrs:
851            print 'start tag: <' + tag + '>'
852        else:
853            print 'start tag: <' + tag,
854            for name, value in attrs.items():
855                print name + '=' + '"' + value + '"',
856            print '>'
857
858    def unknown_endtag(self, tag):
859        self.flush()
860        print 'end tag: </' + tag + '>'
861
862    def unknown_entityref(self, ref):
863        self.flush()
864        print '*** unknown entity ref: &' + ref + ';'
865
866    def unknown_charref(self, ref):
867        self.flush()
868        print '*** unknown char ref: &#' + ref + ';'
869
870    def close(self):
871        XMLParser.close(self)
872        self.flush()
873
874def test(args = None):
875    import sys, getopt
876    from time import time
877
878    if not args:
879        args = sys.argv[1:]
880
881    opts, args = getopt.getopt(args, 'st')
882    klass = TestXMLParser
883    do_time = 0
884    for o, a in opts:
885        if o == '-s':
886            klass = XMLParser
887        elif o == '-t':
888            do_time = 1
889
890    if args:
891        file = args[0]
892    else:
893        file = 'test.xml'
894
895    if file == '-':
896        f = sys.stdin
897    else:
898        try:
899            f = open(file, 'r')
900        except IOError, msg:
901            print file, ":", msg
902            sys.exit(1)
903
904    data = f.read()
905    if f is not sys.stdin:
906        f.close()
907
908    x = klass()
909    t0 = time()
910    try:
911        if do_time:
912            x.feed(data)
913            x.close()
914        else:
915            for c in data:
916                x.feed(c)
917            x.close()
918    except Error, msg:
919        t1 = time()
920        print msg
921        if do_time:
922            print 'total time: %g' % (t1-t0)
923        sys.exit(1)
924    t1 = time()
925    if do_time:
926        print 'total time: %g' % (t1-t0)
927
928
# Run the command-line test driver when this file is executed as a script.
if __name__ == '__main__':
    test()
931