10a8c90248264a8b26970b4473770bcc3df8515fJosh Gao"""A parser for SGML, using the derived class as a static DTD."""
20a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
30a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# XXX This only supports those SGML features used by HTML.
40a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
50a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# XXX There should be a way to distinguish between PCDATA (parsed
60a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# character data -- the normal case), RCDATA (replaceable character
70a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# data -- only char and entity references and end tags are special)
80a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# and CDATA (character data -- only end tags are special).  RCDATA is
90a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# not supported at all.
100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
# Importing this module issues a warning that sgmllib no longer exists in
# Python 3.0.
from warnings import warnpy3k
warnpy3k("the sgmllib module has been removed in Python 3.0",
         stacklevel=2)
# Remove the helper name again so it is not left in this module's namespace.
del warnpy3k
160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
170a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoimport markupbase
180a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoimport re
190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao__all__ = ["SGMLParser", "SGMLParseError"]
210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# Regular expressions used for parsing
230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
# The next character that needs special treatment: the start of a
# reference ('&') or of markup ('<').
interesting = re.compile('[&<]')
# Text that may be an unfinished reference ('&', '&name', '&#123') or
# unfinished markup ('<', '<tag...', '</tag...', '<!...') with no closing
# bracket seen yet.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# Named entity reference; group 1 is the name.  The match also consumes the
# single character after the name, which must not be a letter or digit.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal character reference; group 1 is the digits.  The match also
# consumes the single non-digit character after them.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or by '>' (the '<>' shorthand).
starttagopen = re.compile('<[>a-zA-Z]')
# Beginning of the '<tag/' shorthand form.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete '<tag/data/' shorthand; group 1 is the tag, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction.
piclose = re.compile('>')
# Either angle bracket; used to locate where a tag ends.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part
# (including the '='), group 3 the value with its quotes still attached
# when it was quoted.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
class SGMLParseError(RuntimeError):
    """Signals a parse error; a RuntimeError subclass with no extra state."""
470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# SGML parser base class -- find tags and call handler functions.
500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# The dtd is defined by deriving a class which defines methods
520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# with special names to handle tags: start_foo and end_foo to handle
530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# (Tags are converted to lower case for this purpose.)  The data
550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# between tags is passed to the parser by calling self.handle_data()
560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# with some data as argument (the data may be split up in arbitrary
570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# chunks).  Entity references are passed by calling
580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# self.handle_entityref() with the entity reference as argument.
590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
600a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoclass SGMLParser(markupbase.ParserBase):
    # Matches one entity or character reference inside an attribute value.
    # Group 1 is an entity name, group 2 the digits of a character
    # reference (only one of the two is set for a given match), and group 3
    # is the optional trailing ';'.  Derived classes may override.
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')
650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def __init__(self, verbose=0):
670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Initialize and reset this instance."""
680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.verbose = verbose
690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.reset()
700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def reset(self):
720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Reset this instance. Loses all unprocessed data."""
730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.__starttag_text = None
740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.rawdata = ''
750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.stack = []
760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.lasttag = '???'
770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.nomoretags = 0
780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.literal = 0
790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        markupbase.ParserBase.reset(self)
800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def setnomoretags(self):
820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Enter literal mode (CDATA) till EOF.
830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        Intended for derived classes only.
850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """
860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.nomoretags = self.literal = 1
870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def setliteral(self, *args):
890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Enter literal mode (CDATA).
900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        Intended for derived classes only.
920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """
930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.literal = 1
940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def feed(self, data):
960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Feed some data to the parser.
970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        Call this as often as you want, with as little or as much text
990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        as you want (may include '\n').  (This just saves the text,
1000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        all the processing is done by goahead().)
1010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """
1020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.rawdata = self.rawdata + data
1040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.goahead(0)
1050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def close(self):
1070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Handle the remaining data."""
1080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.goahead(1)
1090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def error(self, message):
1110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        raise SGMLParseError(message)
1120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- handle data as far as reasonable.  May leave state
1140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # and data to be processed by a subsequent call.  If 'end' is
1150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of the buffered input as possible.

        Scans self.rawdata from the start and dispatches plain data, tags,
        comments, processing instructions, declarations and references to
        the corresponding parse_*/handle_* methods.  Text that could still
        become markup once more input arrives is kept in self.rawdata for
        the next call -- unless *end* is true, in which case whatever is
        left over is passed to handle_data().
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Tag recognition is switched off: the rest is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Deliver the run of ordinary text up to the next '&' or '<'.
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is not markup; emit
                        # the '<' as data and carry on after it.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_starttag() returns the index past the tag, or a
                    # negative value when the tag is not complete yet.
                    k = self.parse_starttag(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # Any end tag switches literal mode off again.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        # Some other '<' in literal mode: plain data.
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # Unlike the other parse_* calls here, parse_pi()
                    # returns a length, hence the addition below.
                    k = self.parse_pi(i)
                    if k < 0: break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    # References are not expanded in literal mode.
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one character after the digits;
                    # give it back unless it was the terminating ';'.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same as above: only a ';' terminator is swallowed.
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                # interesting only matches '&' and '<', so this should not
                # be reachable.
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                # Not even a partial construct: emit the character as data.
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            # The partial construct is followed by more text, so it cannot
            # be completed by later input; treat it as data.
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF nothing more can arrive: flush the remainder as data.
            self.handle_data(rawdata[i:n])
            i = n
        # Keep only the unconsumed tail for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack
2170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
    # Extensions for the DOCTYPE scanner:
    # NOTE(review): not referenced anywhere in the visible part of this
    # file; presumably read by markupbase.ParserBase while scanning
    # declarations -- confirm against markupbase.
    _decl_otherchars = '='
2200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- parse processing instr, return length or -1 if not terminated
2220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def parse_pi(self, i):
2230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
2240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if rawdata[i:i+2] != '<?':
2250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.error('unexpected call to parse_pi()')
2260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        match = piclose.search(rawdata, i+2)
2270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if not match:
2280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return -1
2290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        j = match.start(0)
2300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.handle_pi(rawdata[i+2: j])
2310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        j = match.end(0)
2320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return j-i
2330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def get_starttag_text(self):
2350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return self.__starttag_text
2360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
    # Internal -- handle starttag, return end index or -1 if not terminated
2380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def parse_starttag(self, i):
2390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.__starttag_text = None
2400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        start_pos = i
2410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
2420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if shorttagopen.match(rawdata, i):
2430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # SGML shorthand: <tag/data/ == <tag>data</tag>
2440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # XXX Can data contain &... (entity or char refs)?
2450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # XXX Can data contain < or > (tag characters)?
2460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # XXX Can there be whitespace before the first /?
2470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            match = shorttag.match(rawdata, i)
2480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if not match:
2490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return -1
2500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            tag, data = match.group(1, 2)
2510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.__starttag_text = '<%s/' % tag
2520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            tag = tag.lower()
2530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            k = match.end(0)
2540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.finish_shorttag(tag, data)
2550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
2560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return k
2570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # XXX The following should skip matching quotes (' or ")
2580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # As a shortcut way to exit, this isn't so bad, but shouldn't
2590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # be used to locate the actual end of the start tag since the
2600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # < or > characters may be embedded in an attribute value.
2610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        match = endbracket.search(rawdata, i+1)
2620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if not match:
2630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return -1
2640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        j = match.start(0)
2650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # Now parse the data between i+1 and j into a tag and attrs
2660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        attrs = []
2670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if rawdata[i:i+2] == '<>':
2680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            # SGML shorthand: <> == <last open tag seen>
2690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            k = j
2700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            tag = self.lasttag
2710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
2720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            match = tagfind.match(rawdata, i+1)
2730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if not match:
2740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.error('unexpected call to parse_starttag')
2750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            k = match.end(0)
2760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            tag = rawdata[i+1:k].lower()
2770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.lasttag = tag
2780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        while k < j:
2790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            match = attrfind.match(rawdata, k)
2800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if not match: break
2810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            attrname, rest, attrvalue = match.group(1, 2, 3)
2820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if not rest:
2830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                attrvalue = attrname
2840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            else:
2850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if (attrvalue[:1] == "'" == attrvalue[-1:] or
2860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    attrvalue[:1] == '"' == attrvalue[-1:]):
2870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    # strip quotes
2880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    attrvalue = attrvalue[1:-1]
2890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                attrvalue = self.entity_or_charref.sub(
2900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    self._convert_ref, attrvalue)
2910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            attrs.append((attrname.lower(), attrvalue))
2920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            k = match.end(0)
2930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if rawdata[j] == '>':
2940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            j = j+1
2950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.__starttag_text = rawdata[start_pos:j]
2960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.finish_starttag(tag, attrs)
2970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return j
2980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- convert entity or character reference
3000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def _convert_ref(self, match):
3010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if match.group(2):
3020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return self.convert_charref(match.group(2)) or \
3030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                '&#%s%s' % match.groups()[1:]
3040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        elif match.group(3):
3050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return self.convert_entityref(match.group(1)) or \
3060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                '&%s;' % match.group(1)
3070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
3080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return '&%s' % match.group(1)
3090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- parse endtag
3110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def parse_endtag(self, i):
3120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        rawdata = self.rawdata
3130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        match = endbracket.search(rawdata, i+1)
3140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if not match:
3150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return -1
3160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        j = match.start(0)
3170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        tag = rawdata[i+2:j].strip().lower()
3180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if rawdata[j] == '>':
3190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            j = j+1
3200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.finish_endtag(tag)
3210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return j
3220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
3240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def finish_shorttag(self, tag, data):
3250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.finish_starttag(tag, [])
3260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.handle_data(data)
3270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.finish_endtag(tag)
3280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- finish processing of start tag
3300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
3310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def finish_starttag(self, tag, attrs):
3320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        try:
3330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            method = getattr(self, 'start_' + tag)
3340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        except AttributeError:
3350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            try:
3360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                method = getattr(self, 'do_' + tag)
3370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            except AttributeError:
3380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.unknown_starttag(tag, attrs)
3390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return -1
3400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            else:
3410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.handle_starttag(tag, method, attrs)
3420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return 0
3430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
3440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.stack.append(tag)
3450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_starttag(tag, method, attrs)
3460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return 1
3470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Internal -- finish processing of end tag
3490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def finish_endtag(self, tag):
3500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if not tag:
3510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            found = len(self.stack) - 1
3520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if found < 0:
3530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.unknown_endtag(tag)
3540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return
3550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
3560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if tag not in self.stack:
3570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                try:
3580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    method = getattr(self, 'end_' + tag)
3590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                except AttributeError:
3600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    self.unknown_endtag(tag)
3610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                else:
3620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                    self.report_unbalanced(tag)
3630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                return
3640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            found = len(self.stack)
3650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            for i in range(found):
3660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                if self.stack[i] == tag: found = i
3670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        while len(self.stack) > found:
3680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            tag = self.stack[-1]
3690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            try:
3700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                method = getattr(self, 'end_' + tag)
3710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            except AttributeError:
3720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                method = None
3730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if method:
3740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.handle_endtag(tag, method)
3750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            else:
3760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                self.unknown_endtag(tag)
3770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            del self.stack[-1]
3780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle start tag
3800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_starttag(self, tag, method, attrs):
3810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        method(attrs)
3820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Overridable -- handle end tag
3840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_endtag(self, tag, method):
3850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        method()
3860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Example -- report an unbalanced </...> tag.
3880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def report_unbalanced(self, tag):
3890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if self.verbose:
3900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            print '*** Unbalanced </' + tag + '>'
3910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            print '*** Stack:', self.stack
3920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def convert_charref(self, name):
3940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Convert character reference, may be overridden."""
3950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        try:
3960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            n = int(name)
3970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        except ValueError:
3980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return
3990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if not 0 <= n <= 127:
4000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return
4010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return self.convert_codepoint(n)
4020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def convert_codepoint(self, codepoint):
4040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return chr(codepoint)
4050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_charref(self, name):
4070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Handle character reference, no need to override."""
4080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        replacement = self.convert_charref(name)
4090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if replacement is None:
4100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.unknown_charref(name)
4110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
4120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_data(replacement)
4130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Definition of entities -- derived classes may override
4150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    entitydefs = \
4160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
4170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def convert_entityref(self, name):
4190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Convert entity references.
4200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        As an alternative to overriding this method; one can tailor the
4220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        results by setting up the self.entitydefs mapping appropriately.
4230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """
4240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        table = self.entitydefs
4250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if name in table:
4260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return table[name]
4270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
4280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            return
4290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_entityref(self, name):
4310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Handle entity references, no need to override."""
4320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        replacement = self.convert_entityref(name)
4330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if replacement is None:
4340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.unknown_entityref(name)
4350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
4360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.handle_data(replacement)
4370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Example -- handle data, should be overridden
4390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_data(self, data):
4400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Example -- handle comment, could be overridden
4430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_comment(self, data):
4440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Example -- handle declaration, could be overridden
4470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_decl(self, decl):
4480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # Example -- handle processing instruction, could be overridden
4510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def handle_pi(self, data):
4520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        pass
4530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # To be overridden -- handlers for unknown objects
4550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def unknown_starttag(self, tag, attrs): pass
4560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def unknown_endtag(self, tag): pass
4570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def unknown_charref(self, ref): pass
4580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def unknown_entityref(self, ref): pass
4590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass that prints what it is given, for manual testing.

    Character data is accumulated in self.testdata and printed in chunks.
    Comments, unknown tags, unknown references and unknown declarations
    are each printed on their own line, after any pending data has been
    flushed.
    """

    def __init__(self, verbose=0):
        # The buffer is created before the base initialiser runs, in the
        # same order as the original code.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*; flush once the buffer's repr reaches 70 characters."""
        self.testdata += data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if there is any."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print(' '.join(('data:', repr(pending))))

    def handle_comment(self, data):
        """Print the repr of a comment, eliding the middle of long ones."""
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            # Keep the first and last 32 characters of the repr.
            shown = shown[:32] + '...' + shown[-32:]
        print(' '.join(('comment:', shown)))

    def unknown_starttag(self, tag, attrs):
        """Print a start tag and its (name, value) attribute pairs."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
            return
        # Build the single space-separated output line: the opening piece,
        # one name="value" item per attribute, then the closing '>'.
        pieces = ['start tag: <' + tag]
        for name, value in attrs:
            pieces.append(name + '=' + '"' + value + '"')
        pieces.append('>')
        print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print a notice about an entity reference that was not converted."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a notice about a character reference that was not converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print a notice about an unknown declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Close the underlying parser, then print any data still buffered."""
        SGMLParser.close(self)
        self.flush()
5140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
5150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
def test(args=None):
    """Parse a file and, by default, print what the parser saw.

    args is a list of command-line style arguments; when it is None,
    sys.argv[1:] is used.  Recognised forms:

      -s      (first argument only) use the plain SGMLParser instead of
              TestSGMLParser
      FILE    file to parse; '-' means standard input and the default is
              'test.html'

    If the file cannot be opened, the file name and the error are printed
    and the process exits with status 1 via sys.exit().
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    # Named 'filename' rather than 'file' so the builtin is not shadowed.
    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        # Standard input is read but deliberately left open.
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print(' '.join((filename, ":", str(msg))))
            sys.exit(1)
        # Close the file even if read() raises, so the handle never leaks.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # The input is fed one character at a time, as before.
    # NOTE(review): presumably this is to exercise feed() with the
    # smallest possible chunks -- confirm before batching.
    for c in data:
        x.feed(c)
    x.close()
5500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
5510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
# Run the test driver when this module is executed as a script.
if __name__ == "__main__":
    test()
554