14a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair"""Use the HTMLParser library to parse HTML files that aren't too bad."""
24a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
34a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair__all__ = [
44a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    'HTMLParserTreeBuilder',
54a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    ]
64a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
74a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclairfrom HTMLParser import (
84a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    HTMLParser,
94a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    HTMLParseError,
104a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    )
114a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclairimport sys
124a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclairimport warnings
134a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
# The HTMLParser constructor accepts a 'strict' argument starting in
# Python 3.2, and we would like to pass strict=False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, HTMLParser is monkeypatched on the 3.2
# releases older than 3.2.3, and this flag is then forced to True.
major, minor, release = sys.version_info[:3]

# True exactly when the running interpreter is Python 3.2.3 or newer.
CONSTRUCTOR_TAKES_STRICT = (major, minor, release) >= (3, 2, 3)
264a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
274a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclairfrom bs4.element import (
284a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    CData,
294a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    Comment,
304a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    Declaration,
314a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    Doctype,
324a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    ProcessingInstruction,
334a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    )
344a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclairfrom bs4.dammit import EntitySubstitution, UnicodeDammit
354a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
364a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclairfrom bs4.builder import (
374a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    HTML,
384a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    HTMLTreeBuilder,
394a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    STRICT,
404a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair    )
414a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
424a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
# Feature string listed in HTMLParserTreeBuilder.features.
HTMLPARSER = 'html.parser'
444a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to a soup object.

    The object stored in ``self.soup`` (assigned by the caller after
    construction) receives every event through its ``handle_starttag``,
    ``handle_endtag``, ``handle_data`` and ``endData`` methods.
    """

    def handle_starttag(self, name, attrs):
        """Report an opening tag to the soup.

        :param name: The tag name.
        :param attrs: An iterable of (attribute name, value) pairs. A
         value of None is reported to the soup as the empty string.
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            attr_dict[key] = '' if value is None else value
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        """Report a closing tag to the soup."""
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Pass a run of text on to the soup."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Turn a numeric character reference into text data.

        :param name: The reference as handed over by HTMLParser, either
         decimal digits (e.g. '62') or hex digits prefixed with 'x' or
         'X' (e.g. 'x3E').
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        prefix = name[:1]
        if prefix in ('x', 'X'):
            real_name = int(name.lstrip(prefix), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            # The code point cannot be represented; substitute U+FFFD.
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Turn a named entity reference into text data.

        Entities missing from EntitySubstitution.HTML_ENTITY_TO_CHARACTER
        are passed through as the literal text "&name;".
        """
        data = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if data is None:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Add a Comment node containing ``data`` to the soup."""
        # Flush any pending text so the comment becomes its own node.
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Add a Doctype node to the soup.

        A leading "DOCTYPE " is removed from ``data``; a bare "DOCTYPE"
        yields an empty Doctype.
        """
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Add a CData or Declaration node to the soup.

        Data starting with "CDATA[" (case-insensitively) becomes a CData
        node with that prefix removed; anything else becomes a
        Declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Add a ProcessingInstruction node to the soup."""
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
1274a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder that parses markup with BeautifulSoupHTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        """Store the arguments used to construct each parser.

        All positional and keyword arguments are kept in
        ``self.parser_args`` and handed to BeautifulSoupHTMLParser by
        feed(). When CONSTRUCTOR_TAKES_STRICT is true, ``strict=False``
        is forced into the keyword arguments.
        """
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Yield the markup converted to Unicode, with encoding details.

        This is a generator that yields exactly one 4-tuple: (markup,
        original encoding, encoding declared within markup, whether any
        characters had to be replaced with REPLACEMENT CHARACTER).

        :param markup: The document, as a unicode or byte string.
        :param user_specified_encoding: First encoding for
         UnicodeDammit to try.
        :param document_declared_encoding: Second encoding for
         UnicodeDammit to try.
        """
        if isinstance(markup, unicode):
            # Already Unicode: nothing to detect or convert.
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse ``markup`` into ``self.soup`` using a fresh parser.

        :raises HTMLParseError: If HTMLParser rejects the document. A
         RuntimeWarning recommending an external parser is issued
         before the exception propagates.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # A bare raise re-raises the active exception with its
            # original traceback intact.
            raise
1654a4f2fe02baf385f6c24fc98c6e17bf6ac5e0724Dan Sinclair
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag beginning at ``rawdata[i]``.

        Replacement for HTMLParser.parse_starttag. Dispatches to
        handle_startendtag() or handle_starttag(), or to handle_data()
        when the tag ends in junk and the parser is not strict.

        :param i: Index in ``self.rawdata`` of the tag's opening '<'.
        :return: The index just past the tag, or the negative value
         returned by check_for_whole_start_tag() when the tag is
         incomplete.
        """
        # This function is defined outside any class body, so private
        # name mangling does not apply here; spell out the mangled
        # attribute that HTMLParser.get_starttag_text() reads.
        self._HTMLParser__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self._HTMLParser__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "=value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Drop the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Not strict: emit the malformed tag as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Scan only for the closing tag of ``elem`` from now on.

        Replacement for HTMLParser.set_cdata_mode.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patch applied, strict=False can be passed to the
    # constructor on these versions as well.
    CONSTRUCTOR_TAKES_STRICT = True
259