1#!/usr/bin/env python
2"""usage: %prog [options] filename
3
4Parse a document to a tree, with optional profiling
5"""
6
7import sys
8import os
9import traceback
10from optparse import OptionParser
11
12from html5lib import html5parser, sanitizer
13from html5lib.tokenizer import HTMLTokenizer
14from html5lib import treebuilders, serializer, treewalkers
15from html5lib import constants
16from html5lib import utils
17
def _openInput(source):
    """Open *source* and return a ``(stream, encoding)`` pair.

    *source* is an ``http://`` or ``https://`` URL, ``-`` for standard
    input, or a file-system path.  The returned encoding is the transport
    level hint that is passed on to the parser (``None`` means no hint).

    Writes a message to stderr and exits with status 1 when the URL or the
    file cannot be opened.
    """
    if source.startswith(("http://", "https://")):
        import urllib.request
        try:
            stream = urllib.request.urlopen(source)
        except Exception as e:
            # Previously any failure here was swallowed and the URL string
            # itself was then parsed as if it were the document.
            sys.stderr.write("Unable to open URL: %s\n" % e)
            sys.exit(1)
        encoding = "utf8"
        if stream.headers.get("content-type"):
            # A Content-Type header without a charset parameter yields None,
            # matching the old cgi.parse_header()-based lookup; this avoids
            # depending on the cgi module.
            encoding = stream.headers.get_content_charset()
        return stream, encoding

    if source == "-":
        # On Python 3 sys.stdin is passed through with no encoding hint.
        if sys.version_info[0] >= 3:
            return sys.stdin, None
        return sys.stdin, "utf8"

    try:
        return open(source, "rb"), "utf8"
    except IOError as e:
        sys.stderr.write("Unable to open file: %s\n" % e)
        sys.exit(1)


def parse():
    """Command-line entry point: parse one document and report the result.

    Options come from ``getOptParser()``; the last positional argument names
    the input (URL, ``-`` for stdin, or a file path).  Depending on the
    options the parse is profiled with ``cProfile``, timed with
    ``time.time``, or simply run; in the latter two cases a successfully
    parsed document is handed to ``printOutput()``.

    Exits with status 1 when no input is named or it cannot be opened.
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()

    if not args:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    f, encoding = _openInput(args[-1])
    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
        tokenizer = sanitizer.HTMLSanitizer if opts.sanitize else HTMLTokenizer

        p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer,
                                   debug=opts.log)
        parseMethod = p.parseFragment if opts.fragment else p.parse

        if opts.profile:
            import cProfile
            import pstats
            cProfile.runctx("run(parseMethod, f, encoding)", None,
                            {"run": run,
                             "parseMethod": parseMethod,
                             "f": f,
                             "encoding": encoding},
                            "stats.prof")
            # XXX - We should use a temp file here
            stats = pstats.Stats('stats.prof')
            stats.strip_dirs()
            stats.sort_stats('time')
            stats.print_stats()
        elif opts.time:
            import time
            t0 = time.time()
            document = run(parseMethod, f, encoding)
            t1 = time.time()
            # Compare against None: run() returns None on failure, while a
            # valid but childless tree node may itself be falsy.
            if document is not None:
                printOutput(p, document, opts)
                t2 = time.time()
                sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
            else:
                sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
        else:
            document = run(parseMethod, f, encoding)
            if document is not None:
                printOutput(p, document, opts)
    finally:
        # Release the file or HTTP response; stdin is left open.
        if f is not sys.stdin:
            f.close()
94
def run(parseMethod, f, encoding):
    """Call ``parseMethod(f, encoding=encoding)`` and return its result.

    If the call raises an ordinary exception, the traceback is printed via
    ``traceback.print_exc()`` and ``None`` is returned instead.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught so
    the user can still abort a long-running parse.
    """
    try:
        return parseMethod(f, encoding=encoding)
    except Exception:
        traceback.print_exc()
        return None
102
def _writeText(data):
    """Write *data* to stdout, decoding byte strings first on Python 3.

    Serializers such as ``tostring()`` may hand back bytes, which a text
    mode ``sys.stdout`` refuses to write.
    """
    if str is not bytes and isinstance(data, bytes):
        data = data.decode("utf-8")
    sys.stdout.write(data)


def printOutput(parser, document, opts):
    """Print the results of a parse to standard output.

    Args:
        parser: the parser that produced *document*; its ``log``,
            ``errors``, ``tree`` and ``tokenizer.stream.charEncoding``
            attributes are read.
        document: the parse result, or ``None`` to skip document output.
        opts: parsed command-line options (see ``getOptParser()``).

    The detected encoding (``opts.encoding``) and every entry of
    ``parser.log`` are printed first.  The document is then written in the
    first enabled format among xml, tree, hilite and html, and finally the
    parse errors are listed when ``opts.error`` is set.
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif tb == "lxml":
                import lxml.etree
                _writeText(lxml.etree.tostring(document))
            elif tb == "etree":
                _writeText(utils.default_etree.tostring(document))
        elif opts.tree:
            # A single node is wrapped so that fragments (sequences of
            # nodes) and whole documents share the same loop.
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            _writeText(document.hilite("utf-8"))
        elif opts.html:
            # Forward every serializer option that has a matching
            # command-line option.
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                if hasattr(opts, opt):
                    kwargs[opt] = getattr(opts, opt)
            # An unset quote character must not override the serializer's
            # own default.
            if not kwargs.get('quote_char'):
                kwargs.pop('quote_char', None)

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            # Track the last chunk so the trailing-newline check also works
            # when the serializer yields nothing at all.
            last = ""
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                sys.stdout.write(text)
                last = text
            if not last.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = [
            "Line %i Col %i" % pos + " " +
            constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars
            for pos, errorcode, datavars in parser.errors
        ]
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
150
def getOptParser():
    """Build the command-line option parser for this script.

    Returns:
        An ``optparse.OptionParser`` that uses the module docstring as its
        usage text.  Besides the run-mode and output-format switches, it
        defines options whose ``dest`` names are looked up by name when the
        HTML serializer is configured in ``printOutput()``.
    """
    parser = OptionParser(usage=__doc__)

    # --- How to run the parse -------------------------------------------
    parser.add_option("-p", "--profile", action="store_true", default=False,
                      dest="profile", help="Use the cProfile profiler to "
                      "produce a detailed log of the run")

    parser.add_option("-t", "--time",
                      action="store_true", default=False, dest="time",
                      help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")

    parser.add_option("-b", "--treebuilder", action="store", type="string",
                      dest="treebuilder", default="etree",
                      help="Name of the treebuilder to use "
                      "(default: %default)")

    parser.add_option("-e", "--error", action="store_true", default=False,
                      dest="error", help="Print a list of parse errors")

    parser.add_option("-f", "--fragment", action="store_true", default=False,
                      dest="fragment", help="Parse as a fragment")

    # --- Output format ---------------------------------------------------
    parser.add_option("--tree", action="store_true", default=False,
                      dest="tree", help="Output as debug tree")

    parser.add_option("-x", "--xml", action="store_true", default=False,
                      dest="xml", help="Output as xml")

    parser.add_option("--no-html", action="store_false", default=True,
                      dest="html", help="Don't output html")

    parser.add_option("--hilite", action="store_true", default=False,
                      dest="hilite", help="Output as formatted highlighted code.")

    parser.add_option("-c", "--encoding", action="store_true", default=False,
                      dest="encoding", help="Print character encoding used")

    # --- HTML serializer settings -----------------------------------------
    parser.add_option("--inject-meta-charset", action="store_true",
                      default=False, dest="inject_meta_charset",
                      help="inject <meta charset>")

    parser.add_option("--strip-whitespace", action="store_true",
                      default=False, dest="strip_whitespace",
                      help="strip whitespace")

    parser.add_option("--omit-optional-tags", action="store_true",
                      default=False, dest="omit_optional_tags",
                      help="omit optional tags")

    parser.add_option("--quote-attr-values", action="store_true",
                      default=False, dest="quote_attr_values",
                      help="quote attribute values")

    parser.add_option("--use-best-quote-char", action="store_true",
                      default=False, dest="use_best_quote_char",
                      help="use best quote character")

    parser.add_option("--quote-char", action="store",
                      default=None, dest="quote_char",
                      help="quote character")

    # The flag switches minimization OFF, so the help text says so.
    parser.add_option("--no-minimize-boolean-attributes",
                      action="store_false", default=True,
                      dest="minimize_boolean_attributes",
                      help="don't minimize boolean attributes")

    parser.add_option("--use-trailing-solidus", action="store_true",
                      default=False, dest="use_trailing_solidus",
                      help="use trailing solidus")

    parser.add_option("--space-before-trailing-solidus",
                      action="store_true", default=False,
                      dest="space_before_trailing_solidus",
                      help="add space before trailing solidus")

    parser.add_option("--escape-lt-in-attrs", action="store_true",
                      default=False, dest="escape_lt_in_attrs",
                      help="escape less than signs in attribute values")

    parser.add_option("--escape-rcdata", action="store_true",
                      default=False, dest="escape_rcdata",
                      help="escape rcdata element values")

    # --- Parser behaviour --------------------------------------------------
    parser.add_option("--sanitize", action="store_true", default=False,
                      dest="sanitize", help="sanitize")

    parser.add_option("-l", "--log", action="store_true", default=False,
                      dest="log", help="log state transitions")

    return parser
239
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    parse()
242