"""Use the HTMLParser library to parse HTML files that aren't too bad."""

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from HTMLParser import (
    HTMLParser,
    HTMLParseError,
    )
import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
#
# NOTE(review): this check has no upper bound. The 'strict' argument
# was later deprecated and removed from HTMLParser (see the
# html.parser documentation), so newer Python 3 releases may reject
# it -- confirm before running this module there.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
    major > 3
    or (major == 3 and minor > 2)
    or (major == 3 and minor == 2 and release >= 3))

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'

class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to `self.soup`.

    The `soup` attribute is not set by this class; within this module
    it is assigned by HTMLParserTreeBuilder.feed() before any markup
    is fed to the parser.
    """

    def handle_starttag(self, name, attrs):
        """Forward a start tag to the soup object.

        :param name: The tag name.
        :param attrs: An iterable of (key, value) pairs. If a key
         appears more than once, the last value wins.
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            attr_dict[key] = value
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        """Forward an end tag to the soup object."""
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Forward a run of character data to the soup object."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Turn a numeric character reference into a character.

        :param name: The body of the reference: decimal digits, or
         hexadecimal digits prefixed with 'x' or 'X'.

        A code point that unichr() rejects becomes REPLACEMENT
        CHARACTER. The result is passed to handle_data().
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Turn a named entity reference into a character.

        :param name: The entity name, without the '&' and ';'.

        A name missing from EntitySubstitution.HTML_ENTITY_TO_CHARACTER
        is passed through as the literal text "&name;".
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Hand a comment to the soup object as a Comment."""
        # Close off any pending data first, then submit the comment
        # text and close it off as a Comment.
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Hand a declaration to the soup object as a Doctype.

        A leading "DOCTYPE " is stripped; a bare "DOCTYPE" becomes
        the empty string.
        """
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Hand an unrecognized declaration to the soup object.

        Data starting with 'CDATA[' (compared case-insensitively)
        has that prefix removed and is stored as CData; anything
        else is stored as a Declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Hand a processing instruction to the soup object."""
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder that parses markup with BeautifulSoupHTMLParser.

    feed() reads `self.soup`, which is not assigned anywhere in this
    module; presumably the caller sets it before calling feed().
    """

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        """Store the arguments to pass to each parser constructor.

        When CONSTRUCTOR_TAKES_STRICT is true, strict=False is
        forced, overriding any 'strict' value the caller passed.
        """
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Convert markup to Unicode, if it isn't already.

        This is a generator that yields exactly one value.

        :yield: A 4-tuple (markup, original encoding, encoding
         declared within markup, whether any characters had to be
         replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            # Already Unicode: nothing to detect or convert.
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse markup with a newly created BeautifulSoupHTMLParser.

        :raise HTMLParseError: If the parser gives up on the
         document. A RuntimeWarning is issued first, then the
         original exception is re-raised.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # A bare raise keeps the traceback pointing at the real
            # failure inside the parser.
            raise

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # parse_starttag() below uses the module-level name
    # attrfind_tolerant directly, not this class attribute.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        # Replacement for HTMLParser.parse_starttag; installed on
        # BeautifulSoupHTMLParser below. Parses the start tag that
        # begins at index i of self.rawdata and returns the index
        # just past it, or a negative number if the tag is not yet
        # complete.
        #
        # NOTE(review): this function is defined outside a class
        # body, so `__starttag_text` is not name-mangled here and is
        # a different attribute from the one HTMLParser's own methods
        # use -- confirm this is acceptable.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute with no "=value" part.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # lineno and offset are computed here but not used
            # afterwards.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Pass the malformed tag through as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        # Replacement for HTMLParser.set_cdata_mode: record the
        # element name and make the closing tag of that element the
        # next "interesting" pattern.
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patch applied, treat this interpreter like 3.2.3.
    CONSTRUCTOR_TAKES_STRICT = True