10a8c90248264a8b26970b4473770bcc3df8515fJosh Gao"""A parser for HTML and XHTML."""
20a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
30a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# This file is based on sgmllib.py, but the API is slightly different.
40a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
50a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# XXX There should be a way to distinguish between PCDATA (parsed
60a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# character data -- the normal case), RCDATA (replaceable character
70a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# data -- only char and entity references and end tags are special)
80a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# and CDATA (character data -- only end tags are special).
90a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
110a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoimport markupbase
120a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoimport re
130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
# Regular expressions used for parsing
#
# Every pattern containing a regex escape such as \s is written as a raw
# string so that Python never tries to interpret the backslash itself.
# tagfind_tolerant is deliberately NOT raw: its \t, \n, \r, \f and \x00 are
# meant to be the literal control characters inside the character class.

# what goahead() searches for outside CDATA mode
interesting_normal = re.compile('[&<]')
# '&' followed by a character that could begin an entity or char reference
incomplete = re.compile('&[a-zA-Z#]')

# named / numeric references, each including the one terminating character
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# group 1: attribute name, group 2: '=' plus value (optional),
# group 3: the value itself, still including any surrounding quotes
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error message given to the constructor.
        lineno: first element of *position* (None when not supplied).
        offset: second element of *position* (None when not supplied).
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        # Initialise the base class so that ``args`` carries the message;
        # otherwise repr() shows nothing and copy/pickle cannot rebuild
        # the exception (the constructor requires msg).
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by the position parts that are known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # the column is reported as offset + 1
            result = result + ", column %d" % (self.offset + 1)
        return result
700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
720a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoclass HTMLParser(markupbase.ParserBase):
730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    """Find tags and other markup and call handler functions.
740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    Usage:
760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        p = HTMLParser()
770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        p.feed(data)
780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        ...
790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        p.close()
800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    Start tags are handled by calling self.handle_starttag() or
820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    self.handle_startendtag(); end tags by self.handle_endtag().  The
830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    data between tags is passed from the parser to the derived class
840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    by calling self.handle_data() with the data as argument (the data
850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    may be split up in arbitrary chunks).  Entity references are
860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    passed by calling self.handle_entityref() with the entity
870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    reference as the argument.  Numeric character references are
880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    passed to self.handle_charref() with the string containing the
890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    reference as the argument.
900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    """
910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
    # Tag names for which parse_starttag() switches the parser into CDATA
    # mode (via set_cdata_mode) right after reporting the start tag.
    CDATA_CONTENT_ELEMENTS = ("script", "style")
930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
    def __init__(self):
        """Initialize and reset this instance.

        All parser state is set up by reset(), so a freshly constructed
        parser is in the same condition as one that has just been reset.
        """
        self.reset()
980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
    def reset(self):
        """Reset this instance.  Loses all unprocessed data.

        Empties the input buffer, restores the normal (non-CDATA) scanning
        pattern and then delegates to markupbase.ParserBase.reset().
        """
        self.rawdata = ''  # text fed so far that goahead() has not consumed
        self.lasttag = '???'  # placeholder until parse_starttag() sets it
        self.interesting = interesting_normal  # pattern goahead() searches for
        self.cdata_elem = None  # lowercased tag name while in CDATA mode
        markupbase.ParserBase.reset(self)
1060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def feed(self, data):
1080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        r"""Feed data to the parser.
1090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        Call this as often as you want, with as little or as much text
1110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        as you want (may include '\n').
1120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """
1130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.rawdata = self.rawdata + data
1140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.goahead(0)
1150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
    def close(self):
        """Handle any buffered data.

        Runs goahead() with end set, so whatever is still in the buffer is
        processed as if it were followed by an EOF marker.
        """
        self.goahead(1)
1190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
    def error(self, message):
        """Raise HTMLParseError for message at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())
1220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
    # Source text of the most recently parsed start tag.  parse_starttag()
    # clears it on entry and stores the text once a complete tag is found.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'.

        Returns None when no complete start tag has been recorded.
        """
        return self.__starttag_text
1280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def set_cdata_mode(self, elem):
1300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.cdata_elem = elem.lower()
1310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
1320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
    def clear_cdata_mode(self):
        """Leave CDATA mode: restore the normal '&' / '<' scanning pattern."""
        self.interesting = interesting_normal
        self.cdata_elem = None
1360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- handle data as far as reasonable.  May leave state
1380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # and data to be processed by a subsequent call.  If 'end' is
1390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # true, force handling all data as if followed by EOF marker.
1400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def goahead(self, end):
1410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
1420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        i = 0
1430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        n = len(rawdata)
1440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        while i < n:
1450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            match = self.interesting.search(rawdata, i) # < or &
1460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if match:
1470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                j = match.start()
1480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            else:
1490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if self.cdata_elem:
1500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    break
1510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                j = n
1520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if i < j: self.handle_data(rawdata[i:j])
1530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            i = self.updatepos(i, j)
1540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if i == n: break
1550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            startswith = rawdata.startswith
1560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if startswith('<', i):
1570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if starttagopen.match(rawdata, i): # < + letter
1580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    k = self.parse_starttag(i)
1590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                elif startswith("</", i):
1600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    k = self.parse_endtag(i)
1610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                elif startswith("<!--", i):
1620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    k = self.parse_comment(i)
1630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                elif startswith("<?", i):
1640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    k = self.parse_pi(i)
1650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                elif startswith("<!", i):
1660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    k = self.parse_html_declaration(i)
1670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                elif (i + 1) < n:
1680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    self.handle_data("<")
1690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    k = i + 1
1700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                else:
1710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    break
1720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if k < 0:
1730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    if not end:
1740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        break
1750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    k = rawdata.find('>', i + 1)
1760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    if k < 0:
1770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        k = rawdata.find('<', i + 1)
1780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        if k < 0:
1790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                            k = i + 1
1800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    else:
1810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        k += 1
1820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    self.handle_data(rawdata[i:k])
1830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                i = self.updatepos(i, k)
1840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            elif startswith("&#", i):
1850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                match = charref.match(rawdata, i)
1860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if match:
1870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    name = match.group()[2:-1]
1880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    self.handle_charref(name)
1890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    k = match.end()
1900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    if not startswith(';', k-1):
1910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        k = k - 1
1920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    i = self.updatepos(i, k)
1930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    continue
1940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                else:
1950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    if ";" in rawdata[i:]: #bail by consuming &#
1960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        self.handle_data(rawdata[0:2])
1970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        i = self.updatepos(i, 2)
1980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    break
1990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            elif startswith('&', i):
2000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                match = entityref.match(rawdata, i)
2010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if match:
2020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    name = match.group(1)
2030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    self.handle_entityref(name)
2040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    k = match.end()
2050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    if not startswith(';', k-1):
2060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        k = k - 1
2070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    i = self.updatepos(i, k)
2080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    continue
2090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                match = incomplete.match(rawdata, i)
2100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if match:
2110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    # match.group() will contain at least 2 chars
2120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    if end and match.group() == rawdata[i:]:
2130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        self.error("EOF in middle of entity or char ref")
2140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    # incomplete
2150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    break
2160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                elif (i + 1) < n:
2170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    # not the end of the buffer, and can't be confused
2180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    # with some other construct
2190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    self.handle_data("&")
2200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    i = self.updatepos(i, i + 1)
2210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                else:
2220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    break
2230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            else:
2240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                assert 0, "interesting.search() lied"
2250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # end while
2260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if end and i < n and not self.cdata_elem:
2270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_data(rawdata[i:n])
2280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            i = self.updatepos(i, n)
2290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.rawdata = rawdata[i:]
2300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- parse html declarations, return length or -1 if not terminated
2320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
2330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # See also parse_declaration in _markupbase
2340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def parse_html_declaration(self, i):
2350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
2360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if rawdata[i:i+2] != '<!':
2370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.error('unexpected call to parse_html_declaration()')
2380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if rawdata[i:i+4] == '<!--':
2390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # this case is actually already handled in goahead()
2400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return self.parse_comment(i)
2410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        elif rawdata[i:i+3] == '<![':
2420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return self.parse_marked_section(i)
2430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        elif rawdata[i:i+9].lower() == '<!doctype':
2440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # find the closing >
2450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            gtpos = rawdata.find('>', i+9)
2460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if gtpos == -1:
2470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return -1
2480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_decl(rawdata[i+2:gtpos])
2490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return gtpos+1
2500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
2510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return self.parse_bogus_comment(i)
2520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- parse bogus comment, return length or -1 if not terminated
2540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
2550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def parse_bogus_comment(self, i, report=1):
2560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
2570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if rawdata[i:i+2] not in ('<!', '</'):
2580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.error('unexpected call to parse_comment()')
2590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pos = rawdata.find('>', i+2)
2600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if pos == -1:
2610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return -1
2620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if report:
2630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_comment(rawdata[i+2:pos])
2640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return pos + 1
2650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- parse processing instr, return end or -1 if not terminated
2670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def parse_pi(self, i):
2680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
2690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
2700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        match = piclose.search(rawdata, i+2) # >
2710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if not match:
2720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return -1
2730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        j = match.start()
2740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.handle_pi(rawdata[i+2: j])
2750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        j = match.end()
2760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return j
2770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- handle starttag, return end or -1 if not terminated
2790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def parse_starttag(self, i):
2800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.__starttag_text = None
2810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        endpos = self.check_for_whole_start_tag(i)
2820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if endpos < 0:
2830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return endpos
2840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
2850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.__starttag_text = rawdata[i:endpos]
2860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # Now parse the data between i+1 and j into a tag and attrs
2880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        attrs = []
2890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        match = tagfind.match(rawdata, i+1)
2900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        assert match, 'unexpected call to parse_starttag()'
2910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        k = match.end()
2920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.lasttag = tag = match.group(1).lower()
2930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        while k < endpos:
2950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            m = attrfind.match(rawdata, k)
2960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if not m:
2970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                break
2980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            attrname, rest, attrvalue = m.group(1, 2, 3)
2990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if not rest:
3000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                attrvalue = None
3010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
3020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                 attrvalue[:1] == '"' == attrvalue[-1:]:
3030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                attrvalue = attrvalue[1:-1]
3040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if attrvalue:
3050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                attrvalue = self.unescape(attrvalue)
3060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            attrs.append((attrname.lower(), attrvalue))
3070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            k = m.end()
3080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        end = rawdata[k:endpos].strip()
3100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if end not in (">", "/>"):
3110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            lineno, offset = self.getpos()
3120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if "\n" in self.__starttag_text:
3130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                lineno = lineno + self.__starttag_text.count("\n")
3140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                offset = len(self.__starttag_text) \
3150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                         - self.__starttag_text.rfind("\n")
3160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            else:
3170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                offset = offset + len(self.__starttag_text)
3180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_data(rawdata[i:endpos])
3190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return endpos
3200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if end.endswith('/>'):
3210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # XHTML-style empty tag: <span attr="value" />
3220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_startendtag(tag, attrs)
3230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
3240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_starttag(tag, attrs)
3250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if tag in self.CDATA_CONTENT_ELEMENTS:
3260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.set_cdata_mode(tag)
3270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return endpos
3280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- check to see if we have a complete starttag; return end
3300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # or -1 if incomplete.
3310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def check_for_whole_start_tag(self, i):
3320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
3330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        m = locatestarttagend.match(rawdata, i)
3340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if m:
3350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            j = m.end()
3360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            next = rawdata[j:j+1]
3370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if next == ">":
3380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return j + 1
3390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if next == "/":
3400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if rawdata.startswith("/>", j):
3410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    return j + 2
3420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if rawdata.startswith("/", j):
3430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    # buffer boundary
3440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    return -1
3450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                # else bogus input
3460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.updatepos(i, j + 1)
3470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.error("malformed empty start tag")
3480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if next == "":
3490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                # end of input
3500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return -1
3510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if next in ("abcdefghijklmnopqrstuvwxyz=/"
3520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
3530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                # end of input in or before attribute value, or we have the
3540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                # '/' from a '/>' ending
3550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return -1
3560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if j > i:
3570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return j
3580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            else:
3590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return i + 1
3600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        raise AssertionError("we should not get here!")
3610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- parse endtag, return end or -1 if incomplete
3630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def parse_endtag(self, i):
3640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
3650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
3660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        match = endendtag.search(rawdata, i+1) # >
3670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if not match:
3680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return -1
3690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        gtpos = match.end()
3700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        match = endtagfind.match(rawdata, i) # </ + tag + >
3710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if not match:
3720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if self.cdata_elem is not None:
3730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.handle_data(rawdata[i:gtpos])
3740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return gtpos
3750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
3760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            namematch = tagfind_tolerant.match(rawdata, i+2)
3770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if not namematch:
3780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                # w3.org/TR/html5/tokenization.html#end-tag-open-state
3790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if rawdata[i:i+3] == '</>':
3800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    return i+3
3810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                else:
3820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    return self.parse_bogus_comment(i)
3830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            tagname = namematch.group().lower()
3840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # consume and ignore other stuff between the name and the >
3850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # Note: this is not 100% correct, since we might have things like
3860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # </tag attr=">">, but looking for > after tha name should cover
3870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # most of the cases and is much simpler
3880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            gtpos = rawdata.find('>', namematch.end())
3890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_endtag(tagname)
3900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return gtpos+1
3910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        elem = match.group(1).lower() # script or style
3930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if self.cdata_elem is not None:
3940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if elem != self.cdata_elem:
3950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.handle_data(rawdata[i:gtpos])
3960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return gtpos
3970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.handle_endtag(elem)
3990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.clear_cdata_mode()
4000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return gtpos
4010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- finish processing of start+end tag: <tag.../>
4030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_startendtag(self, tag, attrs):
4040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.handle_starttag(tag, attrs)
4050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.handle_endtag(tag)
4060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle start tag
4080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_starttag(self, tag, attrs):
4090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle end tag
4120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_endtag(self, tag):
4130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle character reference
4160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_charref(self, name):
4170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle entity reference
4200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_entityref(self, name):
4210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle data
4240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_data(self, data):
4250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle comment
4280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_comment(self, data):
4290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle declaration
4320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_decl(self, decl):
4330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle processing instruction
4360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_pi(self, data):
4370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def unknown_decl(self, data):
4400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- helper to remove special character quoting
4430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    entitydefs = None
4440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def unescape(self, s):
4450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if '&' not in s:
4460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return s
4470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        def replaceEntities(s):
4480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            s = s.groups()[0]
4490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            try:
4500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if s[0] == "#":
4510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    s = s[1:]
4520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    if s[0] in ['x','X']:
4530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        c = int(s[1:], 16)
4540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    else:
4550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        c = int(s)
4560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    return unichr(c)
4570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            except ValueError:
4580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return '&#'+s+';'
4590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            else:
4600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                # Cannot use name2codepoint directly, because HTMLParser supports apos,
4610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                # which is not part of HTML 4
4620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                import htmlentitydefs
4630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if HTMLParser.entitydefs is None:
4640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
4650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    for k, v in htmlentitydefs.name2codepoint.iteritems():
4660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        entitydefs[k] = unichr(v)
4670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                try:
4680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    return self.entitydefs[s]
4690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                except KeyError:
4700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    return '&'+s+';'
4710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
473