1"""A parser for HTML and XHTML."""
2
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import markupbase
12import re
13
14# Regular expressions used for parsing
15
# In normal (non-CDATA) mode only '&' and '<' can start something special.
interesting_normal = re.compile('[&<]')
# An '&' followed by a letter or '#': possibly the start of a reference
# whose remainder has not arrived yet.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that ends the
# reference (';' or anything else); callers give it back when it is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Raw string: '\s' is a regex escape, not a valid Python string escape.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value itself (still including any surrounding quotes).
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# Raw string: '\s' is a regex escape, not a valid Python string escape.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
52
53
class HTMLParseError(Exception):
    """Error raised by the parser when it cannot continue.

    Attributes:
        msg    -- the human-readable description of the problem
        lineno -- first element of the position pair, or None
        offset -- second element of the position pair (a zero-based
                  column), or None
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) position pair."""
        assert msg
        lineno = position[0]
        offset = position[1]
        self.msg = msg
        self.lineno = lineno
        self.offset = offset

    def __str__(self):
        """Return the message followed by whatever position is known."""
        text = self.msg
        if self.lineno is not None:
            text += ", at line %d" % self.lineno
        if self.offset is not None:
            # offset is zero-based; report a one-based column
            text += ", column %d" % (self.offset + 1)
        return text
70
71
72class HTMLParser(markupbase.ParserBase):
73    """Find tags and other markup and call handler functions.
74
75    Usage:
76        p = HTMLParser()
77        p.feed(data)
78        ...
79        p.close()
80
81    Start tags are handled by calling self.handle_starttag() or
82    self.handle_startendtag(); end tags by self.handle_endtag().  The
83    data between tags is passed from the parser to the derived class
84    by calling self.handle_data() with the data as argument (the data
85    may be split up in arbitrary chunks).  Entity references are
86    passed by calling self.handle_entityref() with the entity
87    reference as the argument.  Numeric character references are
88    passed to self.handle_charref() with the string containing the
89    reference as the argument.
90    """
91
92    CDATA_CONTENT_ELEMENTS = ("script", "style")
93
94
    def __init__(self):
        """Create a parser in its initial state.

        All state is set up by reset(), so a freshly constructed parser
        and one that has just been reset() are equivalent.
        """
        self.reset()
98
99    def reset(self):
100        """Reset this instance.  Loses all unprocessed data."""
101        self.rawdata = ''
102        self.lasttag = '???'
103        self.interesting = interesting_normal
104        self.cdata_elem = None
105        markupbase.ParserBase.reset(self)
106
107    def feed(self, data):
108        r"""Feed data to the parser.
109
110        Call this as often as you want, with as little or as much text
111        as you want (may include '\n').
112        """
113        self.rawdata = self.rawdata + data
114        self.goahead(0)
115
    def close(self):
        """Handle any buffered data.

        Runs goahead() with its 'end' flag set, so whatever is still in
        the buffer is processed as if it were followed by end of input.
        """
        self.goahead(1)
119
    def error(self, message):
        """Raise HTMLParseError carrying message and the current position.

        The position is whatever self.getpos() returns (getpos() is not
        defined in this class; presumably inherited from
        markupbase.ParserBase).
        """
        raise HTMLParseError(message, self.getpos())
122
123    __starttag_text = None
124
    def get_starttag_text(self):
        """Return full source of start tag: '<...>'.

        This is the text stored by the most recent parse_starttag()
        call; it is None if no complete start tag has been stored.
        """
        return self.__starttag_text
128
129    def set_cdata_mode(self, elem):
130        self.cdata_elem = elem.lower()
131        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
132
    def clear_cdata_mode(self):
        """Leave CDATA mode.

        Restores the normal 'interesting' pattern ('&' or '<') and
        forgets the CDATA element name.
        """
        self.interesting = interesting_normal
        self.cdata_elem = None
136
137    # Internal -- handle data as far as reasonable.  May leave state
138    # and data to be processed by a subsequent call.  If 'end' is
139    # true, force handling all data as if followed by EOF marker.
140    def goahead(self, end):
141        rawdata = self.rawdata
142        i = 0
143        n = len(rawdata)
144        while i < n:
145            match = self.interesting.search(rawdata, i) # < or &
146            if match:
147                j = match.start()
148            else:
149                if self.cdata_elem:
150                    break
151                j = n
152            if i < j: self.handle_data(rawdata[i:j])
153            i = self.updatepos(i, j)
154            if i == n: break
155            startswith = rawdata.startswith
156            if startswith('<', i):
157                if starttagopen.match(rawdata, i): # < + letter
158                    k = self.parse_starttag(i)
159                elif startswith("</", i):
160                    k = self.parse_endtag(i)
161                elif startswith("<!--", i):
162                    k = self.parse_comment(i)
163                elif startswith("<?", i):
164                    k = self.parse_pi(i)
165                elif startswith("<!", i):
166                    k = self.parse_html_declaration(i)
167                elif (i + 1) < n:
168                    self.handle_data("<")
169                    k = i + 1
170                else:
171                    break
172                if k < 0:
173                    if not end:
174                        break
175                    k = rawdata.find('>', i + 1)
176                    if k < 0:
177                        k = rawdata.find('<', i + 1)
178                        if k < 0:
179                            k = i + 1
180                    else:
181                        k += 1
182                    self.handle_data(rawdata[i:k])
183                i = self.updatepos(i, k)
184            elif startswith("&#", i):
185                match = charref.match(rawdata, i)
186                if match:
187                    name = match.group()[2:-1]
188                    self.handle_charref(name)
189                    k = match.end()
190                    if not startswith(';', k-1):
191                        k = k - 1
192                    i = self.updatepos(i, k)
193                    continue
194                else:
195                    if ";" in rawdata[i:]: #bail by consuming &#
196                        self.handle_data(rawdata[0:2])
197                        i = self.updatepos(i, 2)
198                    break
199            elif startswith('&', i):
200                match = entityref.match(rawdata, i)
201                if match:
202                    name = match.group(1)
203                    self.handle_entityref(name)
204                    k = match.end()
205                    if not startswith(';', k-1):
206                        k = k - 1
207                    i = self.updatepos(i, k)
208                    continue
209                match = incomplete.match(rawdata, i)
210                if match:
211                    # match.group() will contain at least 2 chars
212                    if end and match.group() == rawdata[i:]:
213                        self.error("EOF in middle of entity or char ref")
214                    # incomplete
215                    break
216                elif (i + 1) < n:
217                    # not the end of the buffer, and can't be confused
218                    # with some other construct
219                    self.handle_data("&")
220                    i = self.updatepos(i, i + 1)
221                else:
222                    break
223            else:
224                assert 0, "interesting.search() lied"
225        # end while
226        if end and i < n and not self.cdata_elem:
227            self.handle_data(rawdata[i:n])
228            i = self.updatepos(i, n)
229        self.rawdata = rawdata[i:]
230
    # Internal -- parse html declarations, return the index just past the
    # declaration, or -1 if it is not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in markupbase
234    def parse_html_declaration(self, i):
235        rawdata = self.rawdata
236        if rawdata[i:i+2] != '<!':
237            self.error('unexpected call to parse_html_declaration()')
238        if rawdata[i:i+4] == '<!--':
239            # this case is actually already handled in goahead()
240            return self.parse_comment(i)
241        elif rawdata[i:i+3] == '<![':
242            return self.parse_marked_section(i)
243        elif rawdata[i:i+9].lower() == '<!doctype':
244            # find the closing >
245            gtpos = rawdata.find('>', i+9)
246            if gtpos == -1:
247                return -1
248            self.handle_decl(rawdata[i+2:gtpos])
249            return gtpos+1
250        else:
251            return self.parse_bogus_comment(i)
252
    # Internal -- parse bogus comment, return the index just past its '>',
    # or -1 if it is not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
255    def parse_bogus_comment(self, i, report=1):
256        rawdata = self.rawdata
257        if rawdata[i:i+2] not in ('<!', '</'):
258            self.error('unexpected call to parse_comment()')
259        pos = rawdata.find('>', i+2)
260        if pos == -1:
261            return -1
262        if report:
263            self.handle_comment(rawdata[i+2:pos])
264        return pos + 1
265
266    # Internal -- parse processing instr, return end or -1 if not terminated
267    def parse_pi(self, i):
268        rawdata = self.rawdata
269        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
270        match = piclose.search(rawdata, i+2) # >
271        if not match:
272            return -1
273        j = match.start()
274        self.handle_pi(rawdata[i+2: j])
275        j = match.end()
276        return j
277
278    # Internal -- handle starttag, return end or -1 if not terminated
279    def parse_starttag(self, i):
280        self.__starttag_text = None
281        endpos = self.check_for_whole_start_tag(i)
282        if endpos < 0:
283            return endpos
284        rawdata = self.rawdata
285        self.__starttag_text = rawdata[i:endpos]
286
287        # Now parse the data between i+1 and j into a tag and attrs
288        attrs = []
289        match = tagfind.match(rawdata, i+1)
290        assert match, 'unexpected call to parse_starttag()'
291        k = match.end()
292        self.lasttag = tag = match.group(1).lower()
293
294        while k < endpos:
295            m = attrfind.match(rawdata, k)
296            if not m:
297                break
298            attrname, rest, attrvalue = m.group(1, 2, 3)
299            if not rest:
300                attrvalue = None
301            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
302                 attrvalue[:1] == '"' == attrvalue[-1:]:
303                attrvalue = attrvalue[1:-1]
304            if attrvalue:
305                attrvalue = self.unescape(attrvalue)
306            attrs.append((attrname.lower(), attrvalue))
307            k = m.end()
308
309        end = rawdata[k:endpos].strip()
310        if end not in (">", "/>"):
311            lineno, offset = self.getpos()
312            if "\n" in self.__starttag_text:
313                lineno = lineno + self.__starttag_text.count("\n")
314                offset = len(self.__starttag_text) \
315                         - self.__starttag_text.rfind("\n")
316            else:
317                offset = offset + len(self.__starttag_text)
318            self.handle_data(rawdata[i:endpos])
319            return endpos
320        if end.endswith('/>'):
321            # XHTML-style empty tag: <span attr="value" />
322            self.handle_startendtag(tag, attrs)
323        else:
324            self.handle_starttag(tag, attrs)
325            if tag in self.CDATA_CONTENT_ELEMENTS:
326                self.set_cdata_mode(tag)
327        return endpos
328
329    # Internal -- check to see if we have a complete starttag; return end
330    # or -1 if incomplete.
331    def check_for_whole_start_tag(self, i):
332        rawdata = self.rawdata
333        m = locatestarttagend.match(rawdata, i)
334        if m:
335            j = m.end()
336            next = rawdata[j:j+1]
337            if next == ">":
338                return j + 1
339            if next == "/":
340                if rawdata.startswith("/>", j):
341                    return j + 2
342                if rawdata.startswith("/", j):
343                    # buffer boundary
344                    return -1
345                # else bogus input
346                self.updatepos(i, j + 1)
347                self.error("malformed empty start tag")
348            if next == "":
349                # end of input
350                return -1
351            if next in ("abcdefghijklmnopqrstuvwxyz=/"
352                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
353                # end of input in or before attribute value, or we have the
354                # '/' from a '/>' ending
355                return -1
356            if j > i:
357                return j
358            else:
359                return i + 1
360        raise AssertionError("we should not get here!")
361
362    # Internal -- parse endtag, return end or -1 if incomplete
363    def parse_endtag(self, i):
364        rawdata = self.rawdata
365        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
366        match = endendtag.search(rawdata, i+1) # >
367        if not match:
368            return -1
369        gtpos = match.end()
370        match = endtagfind.match(rawdata, i) # </ + tag + >
371        if not match:
372            if self.cdata_elem is not None:
373                self.handle_data(rawdata[i:gtpos])
374                return gtpos
375            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
376            namematch = tagfind_tolerant.match(rawdata, i+2)
377            if not namematch:
378                # w3.org/TR/html5/tokenization.html#end-tag-open-state
379                if rawdata[i:i+3] == '</>':
380                    return i+3
381                else:
382                    return self.parse_bogus_comment(i)
383            tagname = namematch.group().lower()
384            # consume and ignore other stuff between the name and the >
385            # Note: this is not 100% correct, since we might have things like
386            # </tag attr=">">, but looking for > after tha name should cover
387            # most of the cases and is much simpler
388            gtpos = rawdata.find('>', namematch.end())
389            self.handle_endtag(tagname)
390            return gtpos+1
391
392        elem = match.group(1).lower() # script or style
393        if self.cdata_elem is not None:
394            if elem != self.cdata_elem:
395                self.handle_data(rawdata[i:gtpos])
396                return gtpos
397
398        self.handle_endtag(elem)
399        self.clear_cdata_mode()
400        return gtpos
401
402    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an XHTML-style empty tag such as <tag.../>.

        The default implementation reports it as a start tag immediately
        followed by the matching end tag.  tag and attrs are exactly what
        handle_starttag() would receive.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)
406
407    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called for each start tag; the default does nothing.

        tag is the lower-cased tag name.  attrs is a list of
        (name, value) pairs with lower-cased names; value is None for an
        attribute written without '= value'.
        """
        pass
410
411    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called for each end tag; the default does nothing.

        tag is the lower-cased tag name.
        """
        pass
414
415    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called for each numeric character reference; does nothing.

        name is the text between '&#' and the terminating character,
        for example '65' or 'x41'.
        """
        pass
418
419    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called for each named entity reference; does nothing.

        name is the entity name without the leading '&' and without the
        terminating character, for example 'amp'.
        """
        pass
422
423    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of character data; does nothing.

        A run of text may be delivered in several calls.
        """
        pass
426
427    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text of a comment; the default does nothing.

        For bogus comments, data is the text between the opening '<!' or
        '</' and the closing '>'.  Regular '<!--' comments are parsed by
        parse_comment(), which is not defined in this class (presumably
        inherited from markupbase.ParserBase).
        """
        pass
430
431    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called for a doctype declaration; the default does nothing.

        decl is the text between '<!' and the closing '>', for example
        'DOCTYPE html'.
        """
        pass
434
435    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called for a processing instruction; the default does nothing.

        data is the text between '<?' and the next '>'.
        """
        pass
438
    def unknown_decl(self, data):
        """Hook for declarations the parser does not recognise; no-op.

        Nothing in this class calls it directly; presumably it is invoked
        by the markupbase.ParserBase marked-section parsing -- TODO
        confirm.
        """
        pass
441
442    # Internal -- helper to remove special character quoting
443    entitydefs = None
444    def unescape(self, s):
445        if '&' not in s:
446            return s
447        def replaceEntities(s):
448            s = s.groups()[0]
449            try:
450                if s[0] == "#":
451                    s = s[1:]
452                    if s[0] in ['x','X']:
453                        c = int(s[1:], 16)
454                    else:
455                        c = int(s)
456                    return unichr(c)
457            except ValueError:
458                return '&#'+s+';'
459            else:
460                # Cannot use name2codepoint directly, because HTMLParser supports apos,
461                # which is not part of HTML 4
462                import htmlentitydefs
463                if HTMLParser.entitydefs is None:
464                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
465                    for k, v in htmlentitydefs.name2codepoint.iteritems():
466                        entitydefs[k] = unichr(v)
467                try:
468                    return self.entitydefs[s]
469                except KeyError:
470                    return '&'+s+';'
471
472        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
473