1ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh"""HTML 2.0 parser. 2ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 3ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehSee the HTML 2.0 specification: 4ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehhttp://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html 5ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh""" 6ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 7ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehfrom warnings import warnpy3k 8ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehwarnpy3k("the htmllib module has been removed in Python 3.0", 9ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh stacklevel=2) 10ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdel warnpy3k 11ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 12ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport sgmllib 13ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 14ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehfrom formatter import AS_IS 15ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 16ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh__all__ = ["HTMLParser", "HTMLParseError"] 17ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 18ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 19ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehclass HTMLParseError(sgmllib.SGMLParseError): 20ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """Error raised when an HTML document can't be parsed.""" 21ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 22ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 23ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehclass HTMLParser(sgmllib.SGMLParser): 24ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """This is the basic HTML parser class. 25ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 26ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh It supports all entity names required by the XHTML 1.0 Recommendation. 27ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2 28ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elements. 29ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 30ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """ 31ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 32ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh from htmlentitydefs import entitydefs 33ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 34ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def __init__(self, formatter, verbose=0): 35ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """Creates an instance of the HTMLParser class. 36ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 37ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh The formatter parameter is the formatter instance associated with 38ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh the parser. 39ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 40ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """ 41ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh sgmllib.SGMLParser.__init__(self, verbose) 42ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter = formatter 43ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 44ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def error(self, message): 45ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh raise HTMLParseError(message) 46ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 47ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def reset(self): 48ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh sgmllib.SGMLParser.reset(self) 49ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.savedata = None 50ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.isindex = 0 51ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.title = None 52ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.base = None 53ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.anchor = None 54ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.anchorlist = [] 55ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.nofill = 0 56ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.list_stack = [] 57ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 58ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # ------ Methods used internally; some may be overridden 59ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 60ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Formatter interface, taking care of 'savedata' mode; 61ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # shouldn't need to be overridden 62ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 63ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def handle_data(self, data): 64ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.savedata is not None: 65ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.savedata = self.savedata + data 66ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 67ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.nofill: 68ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.add_literal_data(data) 69ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 70ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.add_flowing_data(data) 71ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 72ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Hooks to save data; shouldn't need to be overridden 73ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 74ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def save_bgn(self): 75ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """Begins saving character data in a buffer instead of sending it 76ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh to the formatter object. 77ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 78ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh Retrieve the stored data via the save_end() method. Use of the 79ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh save_bgn() / save_end() pair may not be nested. 80ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 81ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """ 82ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.savedata = '' 83ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 84ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def save_end(self): 85ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """Ends buffering character data and returns all data saved since 86ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh the preceding call to the save_bgn() method. 87ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 88ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh If the nofill flag is false, whitespace is collapsed to single 89ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh spaces. A call to this method without a preceding call to the 90ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh save_bgn() method will raise a TypeError exception. 91ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 92ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """ 93ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh data = self.savedata 94ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.savedata = None 95ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if not self.nofill: 96ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh data = ' '.join(data.split()) 97ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return data 98ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 99ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Hooks for anchors; should probably be overridden 100ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 101ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def anchor_bgn(self, href, name, type): 102ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """This method is called at the start of an anchor region. 103ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 104ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh The arguments correspond to the attributes of the <A> tag with 105ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh the same names. The default implementation maintains a list of 106ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh hyperlinks (defined by the HREF attribute for <A> tags) within 107ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh the document. The list of hyperlinks is available as the data 108ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh attribute anchorlist. 109ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 110ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """ 111ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.anchor = href 112ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.anchor: 113ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.anchorlist.append(href) 114ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 115ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def anchor_end(self): 116ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """This method is called at the end of an anchor region. 117ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 118ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh The default implementation adds a textual footnote marker using an 119ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh index into the list of hyperlinks created by the anchor_bgn()method. 120ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 121ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """ 122ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.anchor: 123ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.handle_data("[%d]" % len(self.anchorlist)) 124ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.anchor = None 125ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 126ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Hook for images; should probably be overridden 127ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 128ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def handle_image(self, src, alt, *args): 129ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """This method is called to handle images. 130ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 131ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh The default implementation simply passes the alt value to the 132ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh handle_data() method. 133ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 134ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh """ 135ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.handle_data(alt) 136ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 137ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --------- Top level elememts 138ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 139ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_html(self, attrs): pass 140ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_html(self): pass 141ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 142ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_head(self, attrs): pass 143ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_head(self): pass 144ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 145ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_body(self, attrs): pass 146ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_body(self): pass 147ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 148ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # ------ Head elements 149ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 150ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_title(self, attrs): 151ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.save_bgn() 152ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 153ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_title(self): 154ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.title = self.save_end() 155ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 156ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_base(self, attrs): 157ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for a, v in attrs: 158ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if a == 'href': 159ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.base = v 160ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 161ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_isindex(self, attrs): 162ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.isindex = 1 163ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 164ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_link(self, attrs): 165ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh pass 166ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 167ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_meta(self, attrs): 168ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh pass 169ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 170ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_nextid(self, attrs): # Deprecated 171ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh pass 172ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 173ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # ------ Body elements 174ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 175ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Headings 176ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 177ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_h1(self, attrs): 178ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 179ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font(('h1', 0, 1, 0)) 180ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 181ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_h1(self): 182ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 183ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 184ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 185ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_h2(self, attrs): 186ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 187ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font(('h2', 0, 1, 0)) 188ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 189ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_h2(self): 190ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 191ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 192ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 193ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_h3(self, attrs): 194ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 195ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font(('h3', 0, 1, 0)) 196ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 197ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_h3(self): 198ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 199ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 200ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 201ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_h4(self, attrs): 202ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 203ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font(('h4', 0, 1, 0)) 204ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 205ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_h4(self): 206ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 207ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 208ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 209ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_h5(self, attrs): 210ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 211ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font(('h5', 0, 1, 0)) 212ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 213ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_h5(self): 214ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 215ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 216ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 217ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_h6(self, attrs): 218ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 219ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font(('h6', 0, 1, 0)) 220ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 221ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_h6(self): 222ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 223ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 224ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 225ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Block Structuring Elements 226ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 227ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_p(self, attrs): 228ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 229ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 230ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_pre(self, attrs): 231ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 232ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1)) 233ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.nofill = self.nofill + 1 234ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 235ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_pre(self): 236ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 237ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 238ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.nofill = max(0, self.nofill - 1) 239ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 240ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_xmp(self, attrs): 241ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.start_pre(attrs) 242ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.setliteral('xmp') # Tell SGML parser 243ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 244ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_xmp(self): 245ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.end_pre() 246ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 247ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_listing(self, attrs): 248ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.start_pre(attrs) 249ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.setliteral('listing') # Tell SGML parser 250ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 251ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_listing(self): 252ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.end_pre() 253ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 254ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_address(self, attrs): 255ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(0) 256ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS)) 257ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 258ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_address(self): 259ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(0) 260ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 261ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 262ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_blockquote(self, attrs): 263ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 264ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_margin('blockquote') 265ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 266ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_blockquote(self): 267ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 268ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_margin() 269ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 270ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- List Elements 271ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 272ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_ul(self, attrs): 273ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(not self.list_stack) 274ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_margin('ul') 275ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.list_stack.append(['ul', '*', 0]) 276ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 277ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_ul(self): 278ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.list_stack: del self.list_stack[-1] 279ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(not self.list_stack) 280ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_margin() 281ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 282ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_li(self, attrs): 283ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(0) 284ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.list_stack: 285ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh [dummy, label, counter] = top = self.list_stack[-1] 286ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh top[2] = counter = counter+1 287ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 288ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh label, counter = '*', 0 289ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.add_label_data(label, counter) 290ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 291ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_ol(self, attrs): 292ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(not self.list_stack) 293ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_margin('ol') 294ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh label = '1.' 295ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for a, v in attrs: 296ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if a == 'type': 297ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if len(v) == 1: v = v + '.' 298ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh label = v 299ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.list_stack.append(['ol', label, 0]) 300ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 301ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_ol(self): 302ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.list_stack: del self.list_stack[-1] 303ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(not self.list_stack) 304ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_margin() 305ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 306ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_menu(self, attrs): 307ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.start_ul(attrs) 308ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 309ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_menu(self): 310ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.end_ul() 311ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 312ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_dir(self, attrs): 313ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.start_ul(attrs) 314ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 315ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_dir(self): 316ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.end_ul() 317ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 318ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_dl(self, attrs): 319ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(1) 320ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.list_stack.append(['dl', '', 0]) 321ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 322ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_dl(self): 323ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.ddpop(1) 324ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.list_stack: del self.list_stack[-1] 325ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 326ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_dt(self, attrs): 327ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.ddpop() 328ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 329ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_dd(self, attrs): 330ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.ddpop() 331ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_margin('dd') 332ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.list_stack.append(['dd', '', 0]) 333ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 334ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def ddpop(self, bl=0): 335ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.end_paragraph(bl) 336ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.list_stack: 337ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if self.list_stack[-1][0] == 'dd': 338ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh del self.list_stack[-1] 339ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_margin() 340ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 341ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Phrase Markup 342ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 343ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # Idiomatic Elements 344ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 345ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_cite(self, attrs): self.start_i(attrs) 346ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_cite(self): self.end_i() 347ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 348ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_code(self, attrs): self.start_tt(attrs) 349ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_code(self): self.end_tt() 350ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 351ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_em(self, attrs): self.start_i(attrs) 352ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_em(self): self.end_i() 353ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 354ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_kbd(self, attrs): self.start_tt(attrs) 355ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_kbd(self): self.end_tt() 356ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 357ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_samp(self, attrs): self.start_tt(attrs) 358ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_samp(self): self.end_tt() 359ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 360ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_strong(self, attrs): self.start_b(attrs) 361ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_strong(self): self.end_b() 362ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 363ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_var(self, attrs): self.start_i(attrs) 364ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_var(self): self.end_i() 365ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 366ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # Typographic Elements 367ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 368ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_i(self, attrs): 369ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS)) 370ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_i(self): 371ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 372ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 373ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_b(self, attrs): 374ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS)) 375ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_b(self): 376ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 377ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 378ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_tt(self, attrs): 379ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1)) 380ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_tt(self): 381ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.pop_font() 382ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 383ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def start_a(self, attrs): 384ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh href = '' 385ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh name = '' 386ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh type = '' 387ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for attrname, value in attrs: 388ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh value = value.strip() 389ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if attrname == 'href': 390ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh href = value 391ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if attrname == 'name': 392ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh name = value 393ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if attrname == 'type': 394ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh type = value.lower() 395ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.anchor_bgn(href, name, type) 396ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 397ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def end_a(self): 398ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.anchor_end() 399ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 400ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Line Break 401ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 402ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_br(self, attrs): 403ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.add_line_break() 404ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 405ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Horizontal Rule 406ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 407ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_hr(self, attrs): 408ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.formatter.add_hor_rule() 409ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 410ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Image 411ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 412ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_img(self, attrs): 413ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh align = '' 414ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh alt = '(image)' 415ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh ismap = '' 416ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh src = '' 417ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh width = 0 418ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh height = 0 419ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for attrname, value in attrs: 420ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if attrname == 'align': 421ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh align = value 422ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if attrname == 'alt': 423ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh alt = value 424ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if attrname == 'ismap': 425ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh ismap = value 426ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if attrname == 'src': 427ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh src = value 428ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if attrname == 'width': 429ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh try: width = int(value) 430ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh except ValueError: pass 431ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if attrname == 'height': 432ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh try: height = int(value) 433ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh except ValueError: pass 434ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.handle_image(src, alt, ismap, align, width, height) 435ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 436ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Really Old Unofficial Deprecated Stuff 437ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 438ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def do_plaintext(self, attrs): 439ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.start_pre(attrs) 440ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh self.setnomoretags() # Tell SGML parser 441ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 442ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # --- Unhandled tags 443ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 444ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def unknown_starttag(self, tag, attrs): 445ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh pass 446ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 447ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def unknown_endtag(self, tag): 448ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh pass 449ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 450ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 451ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef test(args = None): 452ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh import sys, formatter 453ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 454ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if not args: 455ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh args = sys.argv[1:] 456ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 457ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh silent = args and args[0] == '-s' 458ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if silent: 459ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh del args[0] 460ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 461ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if args: 462ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh file = args[0] 463ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 464ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh file = 'test.html' 465ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 466ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if file == '-': 467ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh f = sys.stdin 468ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 469ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh try: 470ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh f = open(file, 'r') 471ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh except IOError, msg: 472ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh print file, ":", msg 473ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh sys.exit(1) 474ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 475ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh data = f.read() 476ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 477ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if f is not sys.stdin: 478ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh f.close() 479ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 480ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if silent: 481ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh f = formatter.NullFormatter() 482ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 483ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh f = formatter.AbstractFormatter(formatter.DumbWriter()) 484ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 485ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh p = HTMLParser(f) 486ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh p.feed(data) 487ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh p.close() 488ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 489ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 490ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehif __name__ == '__main__': 491ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh test() 492