1d995e1150cab57ed7c885d4b7dd943495022936bFred Drake"""A parser for HTML and XHTML."""
2d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
3d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# This file is based on sgmllib.py, but the API is slightly different.
4d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
5d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# XXX There should be a way to distinguish between PCDATA (parsed
6d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# character data -- the normal case), RCDATA (replaceable character
7d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# data -- only char and entity references and end tags are special)
8d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# and CDATA (character data -- only end tags are special).
9d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
10d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
11d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeimport markupbase
12d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeimport re
13d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
14d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# Regular expressions used for parsing
15d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
# Text runs end at the first '&' or '<'.
interesting_normal = re.compile('[&<]')
# '&' followed by a letter or '#': the possible start of a reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the one character that follows the
# reference; the caller checks whether that character was ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')

# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# note: if you change tagfind/attrfind remember to update locatestarttagend too
# Raw strings are used so that '\s' reaches the regex engine untouched; the
# engine itself understands \t, \n, \r, \f and \x00 inside a character class.
tagfind = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# this regex is currently unused, but left for backward compatibility
tagfind_tolerant = re.compile(r'[a-zA-Z][^\t\n\r\f />\x00]*')

# group 1: attribute name, group 2: '=' plus value (if any), group 3: value
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
55d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
56d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
class HTMLParseError(Exception):
    """Parse failure, optionally tagged with where it happened.

    Attributes:
        msg    -- the error description (must be non-empty)
        lineno -- first item of *position*, or None when unknown
        offset -- second item of *position* (zero-based), or None
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        # Start from the bare message and append whichever parts of the
        # position are known; the column is reported one-based.
        text = self.msg
        if self.lineno is not None:
            text += ", at line %d" % self.lineno
        if self.offset is not None:
            text += ", column %d" % (self.offset + 1)
        return text
73d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
74d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
75d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeclass HTMLParser(markupbase.ParserBase):
76d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    """Find tags and other markup and call handler functions.
77d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
78d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    Usage:
79d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        p = HTMLParser()
80d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        p.feed(data)
81d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        ...
82d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        p.close()
83d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
84d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    Start tags are handled by calling self.handle_starttag() or
85d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    self.handle_startendtag(); end tags by self.handle_endtag().  The
86d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    data between tags is passed from the parser to the derived class
87d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    by calling self.handle_data() with the data as argument (the data
88d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    may be split up in arbitrary chunks).  Entity references are
89d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    passed by calling self.handle_entityref() with the entity
90d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    reference as the argument.  Numeric character references are
91d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    passed to self.handle_charref() with the string containing the
92d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    reference as the argument.
93d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    """
94d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
95d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    CDATA_CONTENT_ELEMENTS = ("script", "style")
96d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
97d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
98d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def __init__(self):
99d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        """Initialize and reset this instance."""
100d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.reset()
101d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
102d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def reset(self):
103d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        """Reset this instance.  Loses all unprocessed data."""
104d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.rawdata = ''
105d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.lasttag = '???'
106d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.interesting = interesting_normal
1077e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti        self.cdata_elem = None
108d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        markupbase.ParserBase.reset(self)
109d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
110d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def feed(self, data):
11131890bc9ba7c46ed5bcdb91de39f7251badca8b1Éric Araujo        r"""Feed data to the parser.
112d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
113d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        Call this as often as you want, with as little or as much text
114d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        as you want (may include '\n').
115d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        """
116d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.rawdata = self.rawdata + data
117d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.goahead(0)
118d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
119d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def close(self):
120d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        """Handle any buffered data."""
121d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.goahead(1)
122d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
123d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def error(self, message):
124d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        raise HTMLParseError(message, self.getpos())
125d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
126d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    __starttag_text = None
127d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
128d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def get_starttag_text(self):
129d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        """Return full source of start tag: '<...>'."""
130d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        return self.__starttag_text
131d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
1327e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti    def set_cdata_mode(self, elem):
1337e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti        self.cdata_elem = elem.lower()
13400dc60beee3bf4b68fd658716616f25503a3a9ebEzio Melotti        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
135d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
136d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def clear_cdata_mode(self):
137d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.interesting = interesting_normal
1387e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti        self.cdata_elem = None
139d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
140d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Internal -- handle data as far as reasonable.  May leave state
141d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # and data to be processed by a subsequent call.  If 'end' is
142d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # true, force handling all data as if followed by EOF marker.
143d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def goahead(self, end):
144d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        rawdata = self.rawdata
145d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        i = 0
146d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        n = len(rawdata)
147d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        while i < n:
148d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            match = self.interesting.search(rawdata, i) # < or &
149d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if match:
150d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                j = match.start()
151d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            else:
15200dc60beee3bf4b68fd658716616f25503a3a9ebEzio Melotti                if self.cdata_elem:
15300dc60beee3bf4b68fd658716616f25503a3a9ebEzio Melotti                    break
154d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                j = n
155d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if i < j: self.handle_data(rawdata[i:j])
156d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            i = self.updatepos(i, j)
157d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if i == n: break
158d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            startswith = rawdata.startswith
159d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if startswith('<', i):
160d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                if starttagopen.match(rawdata, i): # < + letter
161d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    k = self.parse_starttag(i)
162d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                elif startswith("</", i):
163d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    k = self.parse_endtag(i)
164d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                elif startswith("<!--", i):
165d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    k = self.parse_comment(i)
166d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                elif startswith("<?", i):
167d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    k = self.parse_pi(i)
168d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                elif startswith("<!", i):
1694b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti                    k = self.parse_html_declaration(i)
170d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                elif (i + 1) < n:
171d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    self.handle_data("<")
172d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    k = i + 1
173d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                else:
174d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    break
175d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                if k < 0:
176d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                    if not end:
177d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                        break
178d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                    k = rawdata.find('>', i + 1)
179d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                    if k < 0:
180d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                        k = rawdata.find('<', i + 1)
181d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                        if k < 0:
182d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                            k = i + 1
183d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                    else:
184d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                        k += 1
185d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti                    self.handle_data(rawdata[i:k])
186d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                i = self.updatepos(i, k)
187d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            elif startswith("&#", i):
188d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                match = charref.match(rawdata, i)
189d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                if match:
190d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    name = match.group()[2:-1]
191d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    self.handle_charref(name)
192d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    k = match.end()
193d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    if not startswith(';', k-1):
194d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                        k = k - 1
195d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    i = self.updatepos(i, k)
196d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    continue
197d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                else:
1985a88853bdc1074e62441c7558502bd989c39f056Ezio Melotti                    if ";" in rawdata[i:]:  # bail by consuming '&#'
1995a88853bdc1074e62441c7558502bd989c39f056Ezio Melotti                        self.handle_data(rawdata[i:i+2])
2005a88853bdc1074e62441c7558502bd989c39f056Ezio Melotti                        i = self.updatepos(i, i+2)
201d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    break
202d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            elif startswith('&', i):
203d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                match = entityref.match(rawdata, i)
204d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                if match:
205d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    name = match.group(1)
206d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    self.handle_entityref(name)
207d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    k = match.end()
208d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    if not startswith(';', k-1):
209d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                        k = k - 1
210d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    i = self.updatepos(i, k)
211d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    continue
212d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                match = incomplete.match(rawdata, i)
213d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                if match:
214d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    # match.group() will contain at least 2 chars
215d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    if end and match.group() == rawdata[i:]:
216d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                        self.error("EOF in middle of entity or char ref")
217d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    # incomplete
218d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    break
219d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                elif (i + 1) < n:
220d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    # not the end of the buffer, and can't be confused
221d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    # with some other construct
222d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    self.handle_data("&")
223d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    i = self.updatepos(i, i + 1)
224d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                else:
225d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    break
226d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            else:
227d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                assert 0, "interesting.search() lied"
228d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        # end while
22900dc60beee3bf4b68fd658716616f25503a3a9ebEzio Melotti        if end and i < n and not self.cdata_elem:
230d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            self.handle_data(rawdata[i:n])
231d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            i = self.updatepos(i, n)
232d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.rawdata = rawdata[i:]
233d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
    # Internal -- parse html declarations, return end or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in markupbase
2374b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti    def parse_html_declaration(self, i):
2384b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        rawdata = self.rawdata
2394b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        if rawdata[i:i+2] != '<!':
2404b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            self.error('unexpected call to parse_html_declaration()')
2414b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        if rawdata[i:i+4] == '<!--':
242369cbd744ed06b3e01fe7a2e6a86ea4d85250231Ezio Melotti            # this case is actually already handled in goahead()
2434b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            return self.parse_comment(i)
2444b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        elif rawdata[i:i+3] == '<![':
2454b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            return self.parse_marked_section(i)
2464b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        elif rawdata[i:i+9].lower() == '<!doctype':
2474b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            # find the closing >
248369cbd744ed06b3e01fe7a2e6a86ea4d85250231Ezio Melotti            gtpos = rawdata.find('>', i+9)
2494b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            if gtpos == -1:
2504b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti                return -1
2514b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            self.handle_decl(rawdata[i+2:gtpos])
2524b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            return gtpos+1
2534b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        else:
2544b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            return self.parse_bogus_comment(i)
2554b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti
    # Internal -- parse bogus comment, return end or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
2584b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti    def parse_bogus_comment(self, i, report=1):
2594b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        rawdata = self.rawdata
260f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti        if rawdata[i:i+2] not in ('<!', '</'):
2614b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            self.error('unexpected call to parse_comment()')
2624b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        pos = rawdata.find('>', i+2)
2634b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        if pos == -1:
2644b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            return -1
2654b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        if report:
2664b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti            self.handle_comment(rawdata[i+2:pos])
2674b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti        return pos + 1
2684b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti
269d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Internal -- parse processing instr, return end or -1 if not terminated
270d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def parse_pi(self, i):
271d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        rawdata = self.rawdata
272d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
273d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        match = piclose.search(rawdata, i+2) # >
274d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        if not match:
275d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            return -1
276d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        j = match.start()
277d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.handle_pi(rawdata[i+2: j])
278d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        j = match.end()
279d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        return j
280d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
281d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Internal -- handle starttag, return end or -1 if not terminated
282d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def parse_starttag(self, i):
283d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.__starttag_text = None
284d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        endpos = self.check_for_whole_start_tag(i)
285d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        if endpos < 0:
286d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            return endpos
287d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        rawdata = self.rawdata
288d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.__starttag_text = rawdata[i:endpos]
289d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
290d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        # Now parse the data between i+1 and j into a tag and attrs
291d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        attrs = []
292d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        match = tagfind.match(rawdata, i+1)
293d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        assert match, 'unexpected call to parse_starttag()'
294d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        k = match.end()
295c45868ec697a70a80d1cf8a511894f073fda3a27Ezio Melotti        self.lasttag = tag = match.group(1).lower()
296d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
297d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        while k < endpos:
298d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            m = attrfind.match(rawdata, k)
299d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if not m:
300d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                break
301d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            attrname, rest, attrvalue = m.group(1, 2, 3)
302d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if not rest:
303d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                attrvalue = None
304d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
305d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                 attrvalue[:1] == '"' == attrvalue[-1:]:
306d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                attrvalue = attrvalue[1:-1]
3070f1571ce7fb7da0e2ad75f941b29f2d19717e012Ezio Melotti            if attrvalue:
308d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                attrvalue = self.unescape(attrvalue)
309d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            attrs.append((attrname.lower(), attrvalue))
310d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            k = m.end()
311d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
312d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        end = rawdata[k:endpos].strip()
313d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        if end not in (">", "/>"):
314d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            lineno, offset = self.getpos()
315d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if "\n" in self.__starttag_text:
316d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                lineno = lineno + self.__starttag_text.count("\n")
317d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                offset = len(self.__starttag_text) \
318d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                         - self.__starttag_text.rfind("\n")
319d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            else:
320d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                offset = offset + len(self.__starttag_text)
32165d36dab4d915eb9fada52b867301b546e840faeEzio Melotti            self.handle_data(rawdata[i:endpos])
32265d36dab4d915eb9fada52b867301b546e840faeEzio Melotti            return endpos
323d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        if end.endswith('/>'):
324d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            # XHTML-style empty tag: <span attr="value" />
325d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            self.handle_startendtag(tag, attrs)
326d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        else:
327d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            self.handle_starttag(tag, attrs)
328d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if tag in self.CDATA_CONTENT_ELEMENTS:
3297e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti                self.set_cdata_mode(tag)
330d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        return endpos
331d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
332d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Internal -- check to see if we have a complete starttag; return end
333d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # or -1 if incomplete.
334d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def check_for_whole_start_tag(self, i):
335d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        rawdata = self.rawdata
336d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        m = locatestarttagend.match(rawdata, i)
337d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        if m:
338d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            j = m.end()
339d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            next = rawdata[j:j+1]
340d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if next == ">":
341d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                return j + 1
342d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if next == "/":
343d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                if rawdata.startswith("/>", j):
344d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    return j + 2
345d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                if rawdata.startswith("/", j):
346d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    # buffer boundary
347d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    return -1
348d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                # else bogus input
349d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                self.updatepos(i, j + 1)
350d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                self.error("malformed empty start tag")
351d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if next == "":
352d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                # end of input
353d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                return -1
354d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            if next in ("abcdefghijklmnopqrstuvwxyz=/"
355d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
356d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                # end of input in or before attribute value, or we have the
357d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                # '/' from a '/>' ending
358d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                return -1
35965d36dab4d915eb9fada52b867301b546e840faeEzio Melotti            if j > i:
36065d36dab4d915eb9fada52b867301b546e840faeEzio Melotti                return j
36165d36dab4d915eb9fada52b867301b546e840faeEzio Melotti            else:
36265d36dab4d915eb9fada52b867301b546e840faeEzio Melotti                return i + 1
363d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        raise AssertionError("we should not get here!")
364d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
365d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Internal -- parse endtag, return end or -1 if incomplete
366d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def parse_endtag(self, i):
367d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        rawdata = self.rawdata
368d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
369d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        match = endendtag.search(rawdata, i+1) # >
370d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        if not match:
371d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            return -1
372f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti        gtpos = match.end()
373d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        match = endtagfind.match(rawdata, i) # </ + tag + >
374d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        if not match:
3757e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti            if self.cdata_elem is not None:
376f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti                self.handle_data(rawdata[i:gtpos])
377f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti                return gtpos
378f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
379b8147452265077d4c12464a9943903f0d040f79cEzio Melotti            namematch = tagfind.match(rawdata, i+2)
380f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti            if not namematch:
381f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti                # w3.org/TR/html5/tokenization.html#end-tag-open-state
382f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti                if rawdata[i:i+3] == '</>':
383f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti                    return i+3
384f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti                else:
385f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti                    return self.parse_bogus_comment(i)
386b8147452265077d4c12464a9943903f0d040f79cEzio Melotti            tagname = namematch.group(1).lower()
387f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti            # consume and ignore other stuff between the name and the >
388f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti            # Note: this is not 100% correct, since we might have things like
389f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti            # </tag attr=">">, but looking for > after tha name should cover
390f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti            # most of the cases and is much simpler
391f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti            gtpos = rawdata.find('>', namematch.end())
392f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti            self.handle_endtag(tagname)
393f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti            return gtpos+1
3947e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti
3957e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti        elem = match.group(1).lower() # script or style
3967e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti        if self.cdata_elem is not None:
3977e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti            if elem != self.cdata_elem:
398f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti                self.handle_data(rawdata[i:gtpos])
399f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti                return gtpos
4007e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti
4017e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti        self.handle_endtag(elem)
402d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.clear_cdata_mode()
403f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti        return gtpos
404d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
405d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Overridable -- finish processing of start+end tag: <tag.../>
406d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def handle_startendtag(self, tag, attrs):
407d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.handle_starttag(tag, attrs)
408d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        self.handle_endtag(tag)
409d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
410d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Overridable -- handle start tag
411d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def handle_starttag(self, tag, attrs):
412d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        pass
413d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
414d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Overridable -- handle end tag
415d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def handle_endtag(self, tag):
416d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        pass
417d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
418d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Overridable -- handle character reference
419d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def handle_charref(self, name):
420d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        pass
421d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
422d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Overridable -- handle entity reference
423d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def handle_entityref(self, name):
424d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        pass
425d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
426d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Overridable -- handle data
427d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def handle_data(self, data):
428d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        pass
429d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
430d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Overridable -- handle comment
431d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def handle_comment(self, data):
432d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        pass
433d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
434d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Overridable -- handle declaration
435d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def handle_decl(self, decl):
436d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        pass
437d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
438d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Overridable -- handle processing instruction
439d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def handle_pi(self, data):
440d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        pass
441d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
442d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def unknown_decl(self, data):
443369cbd744ed06b3e01fe7a2e6a86ea4d85250231Ezio Melotti        pass
444d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
445d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    # Internal -- helper to remove special character quoting
446d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    entitydefs = None
447d995e1150cab57ed7c885d4b7dd943495022936bFred Drake    def unescape(self, s):
448d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        if '&' not in s:
449d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            return s
450d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        def replaceEntities(s):
451d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            s = s.groups()[0]
4523f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran            try:
4533f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran                if s[0] == "#":
4543f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran                    s = s[1:]
4553f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran                    if s[0] in ['x','X']:
4563f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran                        c = int(s[1:], 16)
4573f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran                    else:
4583f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran                        c = int(s)
4593f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran                    return unichr(c)
4603f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran            except ValueError:
4613f60f09eb23be3289ac5cc019391711dcdf800b3Senthil Kumaran                return '&#'+s+';'
462d995e1150cab57ed7c885d4b7dd943495022936bFred Drake            else:
463d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                # Cannot use name2codepoint directly, because HTMLParser supports apos,
464d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                # which is not part of HTML 4
465d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                import htmlentitydefs
466d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                if HTMLParser.entitydefs is None:
467d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
468d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    for k, v in htmlentitydefs.name2codepoint.iteritems():
469d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                        entitydefs[k] = unichr(v)
470d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                try:
471d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    return self.entitydefs[s]
472d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                except KeyError:
473d995e1150cab57ed7c885d4b7dd943495022936bFred Drake                    return '&'+s+';'
474d995e1150cab57ed7c885d4b7dd943495022936bFred Drake
475d995e1150cab57ed7c885d4b7dd943495022936bFred Drake        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
476