14adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao"""A parser for HTML and XHTML."""
24adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
34adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# This file is based on sgmllib.py, but the API is slightly different.
44adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
54adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# XXX There should be a way to distinguish between PCDATA (parsed
64adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# character data -- the normal case), RCDATA (replaceable character
74adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# data -- only char and entity references and end tags are special)
84adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# and CDATA (character data -- only end tags are special).
94adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoimport markupbase
124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoimport re
134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# Regular expressions used for parsing
154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaointeresting_normal = re.compile('[&<]')
174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoincomplete = re.compile('&[a-zA-Z#]')
184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoentityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaocharref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaostarttagopen = re.compile('<[a-zA-Z]')
234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaopiclose = re.compile('>')
244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaocommentclose = re.compile(r'--\s*>')
254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaotagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaotagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoattrfind = re.compile(
314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaolocatestarttagend = re.compile(r"""
354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao  (?:[\s/]*                          # optional whitespace before attribute name
374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao      (?:\s*=+\s*                    # value indicator
394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        (?:'[^']*'                   # LITA-enclosed value
404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          |"[^"]*"                   # LIT-enclosed value
414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          |(?!['"])[^>\s]*           # bare value
424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao         )
434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao       )?(?:\s|/(?!>))*
444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao     )*
454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao   )?
464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao  \s*                                # trailing whitespace
474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao""", re.VERBOSE)
484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoendendtag = re.compile('>')
494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# </ and the tag name, so maybe this should be fixed
514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoendtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoclass HTMLParseError(Exception):
554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """Exception raised for all parse errors."""
564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def __init__(self, msg, position=(None, None)):
584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        assert msg
594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.msg = msg
604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.lineno = position[0]
614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.offset = position[1]
624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def __str__(self):
644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        result = self.msg
654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if self.lineno is not None:
664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            result = result + ", at line %d" % self.lineno
674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if self.offset is not None:
684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            result = result + ", column %d" % (self.offset + 1)
694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return result
704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoclass HTMLParser(markupbase.ParserBase):
734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """Find tags and other markup and call handler functions.
744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    Usage:
764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        p = HTMLParser()
774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        p.feed(data)
784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        ...
794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        p.close()
804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    Start tags are handled by calling self.handle_starttag() or
824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    self.handle_startendtag(); end tags by self.handle_endtag().  The
834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    data between tags is passed from the parser to the derived class
844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    by calling self.handle_data() with the data as argument (the data
854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    may be split up in arbitrary chunks).  Entity references are
864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    passed by calling self.handle_entityref() with the entity
874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    reference as the argument.  Numeric character references are
884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    passed to self.handle_charref() with the string containing the
894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    reference as the argument.
904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    CDATA_CONTENT_ELEMENTS = ("script", "style")
934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def __init__(self):
964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        """Initialize and reset this instance."""
974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.reset()
984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def reset(self):
1004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        """Reset this instance.  Loses all unprocessed data."""
1014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.rawdata = ''
1024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.lasttag = '???'
1034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.interesting = interesting_normal
1044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.cdata_elem = None
1054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        markupbase.ParserBase.reset(self)
1064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def feed(self, data):
1084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        r"""Feed data to the parser.
1094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        Call this as often as you want, with as little or as much text
1114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        as you want (may include '\n').
1124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        """
1134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.rawdata = self.rawdata + data
1144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.goahead(0)
1154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def close(self):
1174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        """Handle any buffered data."""
1184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.goahead(1)
1194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def error(self, message):
1214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        raise HTMLParseError(message, self.getpos())
1224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    __starttag_text = None
1244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def get_starttag_text(self):
1264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        """Return full source of start tag: '<...>'."""
1274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return self.__starttag_text
1284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def set_cdata_mode(self, elem):
1304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.cdata_elem = elem.lower()
1314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
1324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def clear_cdata_mode(self):
1344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.interesting = interesting_normal
1354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.cdata_elem = None
1364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Internal -- handle data as far as reasonable.  May leave state
1384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # and data to be processed by a subsequent call.  If 'end' is
1394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # true, force handling all data as if followed by EOF marker.
1404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def goahead(self, end):
1414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        rawdata = self.rawdata
1424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        i = 0
1434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        n = len(rawdata)
1444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        while i < n:
1454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            match = self.interesting.search(rawdata, i) # < or &
1464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if match:
1474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                j = match.start()
1484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            else:
1494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if self.cdata_elem:
1504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    break
1514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                j = n
1524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if i < j: self.handle_data(rawdata[i:j])
1534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            i = self.updatepos(i, j)
1544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if i == n: break
1554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            startswith = rawdata.startswith
1564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if startswith('<', i):
1574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if starttagopen.match(rawdata, i): # < + letter
1584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    k = self.parse_starttag(i)
1594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif startswith("</", i):
1604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    k = self.parse_endtag(i)
1614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif startswith("<!--", i):
1624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    k = self.parse_comment(i)
1634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif startswith("<?", i):
1644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    k = self.parse_pi(i)
1654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif startswith("<!", i):
1664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    k = self.parse_html_declaration(i)
1674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif (i + 1) < n:
1684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    self.handle_data("<")
1694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    k = i + 1
1704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else:
1714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    break
1724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if k < 0:
1734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if not end:
1744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        break
1754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    k = rawdata.find('>', i + 1)
1764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if k < 0:
1774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        k = rawdata.find('<', i + 1)
1784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        if k < 0:
1794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                            k = i + 1
1804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    else:
1814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        k += 1
1824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    self.handle_data(rawdata[i:k])
1834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                i = self.updatepos(i, k)
1844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            elif startswith("&#", i):
1854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                match = charref.match(rawdata, i)
1864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if match:
1874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    name = match.group()[2:-1]
1884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    self.handle_charref(name)
1894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    k = match.end()
1904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if not startswith(';', k-1):
1914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        k = k - 1
1924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    i = self.updatepos(i, k)
1934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    continue
1944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else:
1954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if ";" in rawdata[i:]: #bail by consuming &#
1964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        self.handle_data(rawdata[0:2])
1974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        i = self.updatepos(i, 2)
1984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    break
1994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            elif startswith('&', i):
2004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                match = entityref.match(rawdata, i)
2014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if match:
2024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    name = match.group(1)
2034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    self.handle_entityref(name)
2044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    k = match.end()
2054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if not startswith(';', k-1):
2064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        k = k - 1
2074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    i = self.updatepos(i, k)
2084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    continue
2094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                match = incomplete.match(rawdata, i)
2104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if match:
2114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    # match.group() will contain at least 2 chars
2124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if end and match.group() == rawdata[i:]:
2134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        self.error("EOF in middle of entity or char ref")
2144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    # incomplete
2154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    break
2164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif (i + 1) < n:
2174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    # not the end of the buffer, and can't be confused
2184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    # with some other construct
2194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    self.handle_data("&")
2204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    i = self.updatepos(i, i + 1)
2214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else:
2224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    break
2234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            else:
2244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                assert 0, "interesting.search() lied"
2254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        # end while
2264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if end and i < n and not self.cdata_elem:
2274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.handle_data(rawdata[i:n])
2284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            i = self.updatepos(i, n)
2294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.rawdata = rawdata[i:]
2304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Internal -- parse html declarations, return length or -1 if not terminated
2324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
2334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # See also parse_declaration in _markupbase
2344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def parse_html_declaration(self, i):
2354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        rawdata = self.rawdata
2364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if rawdata[i:i+2] != '<!':
2374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.error('unexpected call to parse_html_declaration()')
2384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if rawdata[i:i+4] == '<!--':
2394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            # this case is actually already handled in goahead()
2404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return self.parse_comment(i)
2414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        elif rawdata[i:i+3] == '<![':
2424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return self.parse_marked_section(i)
2434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        elif rawdata[i:i+9].lower() == '<!doctype':
2444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            # find the closing >
2454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            gtpos = rawdata.find('>', i+9)
2464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if gtpos == -1:
2474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                return -1
2484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.handle_decl(rawdata[i+2:gtpos])
2494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return gtpos+1
2504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        else:
2514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return self.parse_bogus_comment(i)
2524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Internal -- parse bogus comment, return length or -1 if not terminated
2544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
2554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def parse_bogus_comment(self, i, report=1):
2564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        rawdata = self.rawdata
2574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if rawdata[i:i+2] not in ('<!', '</'):
2584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.error('unexpected call to parse_comment()')
2594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pos = rawdata.find('>', i+2)
2604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if pos == -1:
2614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return -1
2624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if report:
2634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.handle_comment(rawdata[i+2:pos])
2644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return pos + 1
2654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Internal -- parse processing instr, return end or -1 if not terminated
2674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def parse_pi(self, i):
2684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        rawdata = self.rawdata
2694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
2704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        match = piclose.search(rawdata, i+2) # >
2714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if not match:
2724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return -1
2734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        j = match.start()
2744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.handle_pi(rawdata[i+2: j])
2754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        j = match.end()
2764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return j
2774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Internal -- handle starttag, return end or -1 if not terminated
2794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def parse_starttag(self, i):
2804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.__starttag_text = None
2814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        endpos = self.check_for_whole_start_tag(i)
2824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if endpos < 0:
2834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return endpos
2844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        rawdata = self.rawdata
2854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.__starttag_text = rawdata[i:endpos]
2864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        # Now parse the data between i+1 and j into a tag and attrs
2884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        attrs = []
2894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        match = tagfind.match(rawdata, i+1)
2904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        assert match, 'unexpected call to parse_starttag()'
2914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        k = match.end()
2924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.lasttag = tag = match.group(1).lower()
2934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        while k < endpos:
2954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            m = attrfind.match(rawdata, k)
2964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if not m:
2974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                break
2984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            attrname, rest, attrvalue = m.group(1, 2, 3)
2994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if not rest:
3004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                attrvalue = None
3014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
3024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                 attrvalue[:1] == '"' == attrvalue[-1:]:
3034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                attrvalue = attrvalue[1:-1]
3044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if attrvalue:
3054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                attrvalue = self.unescape(attrvalue)
3064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            attrs.append((attrname.lower(), attrvalue))
3074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            k = m.end()
3084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        end = rawdata[k:endpos].strip()
3104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if end not in (">", "/>"):
3114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            lineno, offset = self.getpos()
3124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if "\n" in self.__starttag_text:
3134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                lineno = lineno + self.__starttag_text.count("\n")
3144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                offset = len(self.__starttag_text) \
3154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                         - self.__starttag_text.rfind("\n")
3164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            else:
3174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                offset = offset + len(self.__starttag_text)
3184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.handle_data(rawdata[i:endpos])
3194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return endpos
3204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if end.endswith('/>'):
3214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            # XHTML-style empty tag: <span attr="value" />
3224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.handle_startendtag(tag, attrs)
3234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        else:
3244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.handle_starttag(tag, attrs)
3254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if tag in self.CDATA_CONTENT_ELEMENTS:
3264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                self.set_cdata_mode(tag)
3274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return endpos
3284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Internal -- check to see if we have a complete starttag; return end
3304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # or -1 if incomplete.
3314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def check_for_whole_start_tag(self, i):
3324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        rawdata = self.rawdata
3334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        m = locatestarttagend.match(rawdata, i)
3344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if m:
3354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            j = m.end()
3364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            next = rawdata[j:j+1]
3374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if next == ">":
3384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                return j + 1
3394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if next == "/":
3404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if rawdata.startswith("/>", j):
3414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    return j + 2
3424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if rawdata.startswith("/", j):
3434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    # buffer boundary
3444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    return -1
3454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                # else bogus input
3464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                self.updatepos(i, j + 1)
3474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                self.error("malformed empty start tag")
3484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if next == "":
3494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                # end of input
3504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                return -1
3514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if next in ("abcdefghijklmnopqrstuvwxyz=/"
3524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
3534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                # end of input in or before attribute value, or we have the
3544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                # '/' from a '/>' ending
3554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                return -1
3564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if j > i:
3574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                return j
3584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            else:
3594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                return i + 1
3604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        raise AssertionError("we should not get here!")
3614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Internal -- parse endtag, return end or -1 if incomplete
3634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def parse_endtag(self, i):
3644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        rawdata = self.rawdata
3654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
3664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        match = endendtag.search(rawdata, i+1) # >
3674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if not match:
3684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return -1
3694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        gtpos = match.end()
3704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        match = endtagfind.match(rawdata, i) # </ + tag + >
3714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if not match:
3724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if self.cdata_elem is not None:
3734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                self.handle_data(rawdata[i:gtpos])
3744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                return gtpos
3754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
3764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            namematch = tagfind_tolerant.match(rawdata, i+2)
3774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if not namematch:
3784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                # w3.org/TR/html5/tokenization.html#end-tag-open-state
3794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if rawdata[i:i+3] == '</>':
3804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    return i+3
3814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else:
3824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    return self.parse_bogus_comment(i)
3834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            tagname = namematch.group().lower()
3844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            # consume and ignore other stuff between the name and the >
3854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            # Note: this is not 100% correct, since we might have things like
3864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            # </tag attr=">">, but looking for > after tha name should cover
3874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            # most of the cases and is much simpler
3884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            gtpos = rawdata.find('>', namematch.end())
3894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.handle_endtag(tagname)
3904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return gtpos+1
3914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        elem = match.group(1).lower() # script or style
3934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if self.cdata_elem is not None:
3944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if elem != self.cdata_elem:
3954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                self.handle_data(rawdata[i:gtpos])
3964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                return gtpos
3974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.handle_endtag(elem)
3994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.clear_cdata_mode()
4004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return gtpos
4014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Overridable -- finish processing of start+end tag: <tag.../>
4034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def handle_startendtag(self, tag, attrs):
4044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.handle_starttag(tag, attrs)
4054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.handle_endtag(tag)
4064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Overridable -- handle start tag
4084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def handle_starttag(self, tag, attrs):
4094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
4104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Overridable -- handle end tag
4124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def handle_endtag(self, tag):
4134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
4144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Overridable -- handle character reference
4164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def handle_charref(self, name):
4174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
4184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Overridable -- handle entity reference
4204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def handle_entityref(self, name):
4214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
4224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Overridable -- handle data
4244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def handle_data(self, data):
4254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
4264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Overridable -- handle comment
4284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def handle_comment(self, data):
4294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
4304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Overridable -- handle declaration
4324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def handle_decl(self, decl):
4334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
4344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Overridable -- handle processing instruction
4364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def handle_pi(self, data):
4374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
4384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def unknown_decl(self, data):
4404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
4414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Internal -- helper to remove special character quoting
4434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    entitydefs = None
4444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def unescape(self, s):
4454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if '&' not in s:
4464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return s
4474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        def replaceEntities(s):
4484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            s = s.groups()[0]
4494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            try:
4504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if s[0] == "#":
4514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    s = s[1:]
4524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if s[0] in ['x','X']:
4534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        c = int(s[1:], 16)
4544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    else:
4554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        c = int(s)
4564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    return unichr(c)
4574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            except ValueError:
4584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                return '&#'+s+';'
4594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            else:
4604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                # Cannot use name2codepoint directly, because HTMLParser supports apos,
4614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                # which is not part of HTML 4
4624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                import htmlentitydefs
4634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if HTMLParser.entitydefs is None:
4644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
4654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    for k, v in htmlentitydefs.name2codepoint.iteritems():
4664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        entitydefs[k] = unichr(v)
4674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                try:
4684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    return self.entitydefs[s]
4694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                except KeyError:
4704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    return '&'+s+';'
4714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
473