#!/usr/bin/env python
"""usage: %prog [options] filename

Parse a document to a tree, with optional profiling
"""

import sys
import os
import traceback
from optparse import OptionParser

from html5lib import html5parser, sanitizer
from html5lib.tokenizer import HTMLTokenizer
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
from html5lib import utils


def _openInput(name):
    """Open the input named on the command line.

    ``name`` is an ``http://``/``https://`` URL, ``-`` for standard input,
    or a path on the local file system.

    Returns a ``(stream, encoding)`` pair. ``encoding`` is ``"utf8"`` unless
    the input is a URL whose response carries a Content-Type header (then
    it is that header's charset, or ``None`` when the header names none) or
    the input is stdin on Python 3 (then ``None``).

    Writes a message to stderr and exits with status 1 when the URL or the
    file cannot be opened.
    """
    encoding = "utf8"

    if name.startswith(('http://', 'https://')):
        try:
            # Imported lazily so that local-file use does not need urllib.
            import urllib.request
            stream = urllib.request.urlopen(name)
        except (ImportError, IOError, ValueError) as e:
            # Without this, the URL text itself would be parsed as the
            # document, which is never what the user asked for.
            sys.stderr.write("Unable to open URL: %s\n" % e)
            sys.exit(1)
        # The cgi module (formerly used to parse this header) was removed
        # in Python 3.13; the response headers can report the charset
        # themselves.
        if stream.headers.get('content-type'):
            encoding = stream.headers.get_content_charset()
        return stream, encoding

    if name == '-':
        if sys.version_info[0] >= 3:
            # sys.stdin is already a text stream on Python 3.
            encoding = None
        return sys.stdin, encoding

    try:
        # Try opening from file system
        stream = open(name, "rb")
    except IOError as e:
        sys.stderr.write("Unable to open file: %s\n" % e)
        sys.exit(1)
    return stream, encoding


def parse():
    """Command-line entry point.

    Reads options and the input name from ``sys.argv``, parses the input
    with the selected tree builder, then prints the result, profiles the
    run (``--profile``) or times it (``--time``).
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()

    try:
        name = args[-1]
    except IndexError:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    f, encoding = _openInput(name)

    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

        if opts.sanitize:
            tokenizer = sanitizer.HTMLSanitizer
        else:
            tokenizer = HTMLTokenizer

        p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer,
                                   debug=opts.log)

        if opts.fragment:
            parseMethod = p.parseFragment
        else:
            parseMethod = p.parse

        if opts.profile:
            import cProfile
            import pstats
            cProfile.runctx("run(parseMethod, f, encoding)", None,
                            {"run": run,
                             "parseMethod": parseMethod,
                             "f": f,
                             "encoding": encoding},
                            "stats.prof")
            # XXX - We should use a temp file here
            stats = pstats.Stats('stats.prof')
            stats.strip_dirs()
            stats.sort_stats('time')
            stats.print_stats()
        elif opts.time:
            import time
            t0 = time.time()
            document = run(parseMethod, f, encoding)
            t1 = time.time()
            # Compare against None: tree nodes may be falsy when they have
            # no children, and run() signals failure with None.
            if document is not None:
                printOutput(p, document, opts)
                t2 = time.time()
                sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
            else:
                sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
        else:
            document = run(parseMethod, f, encoding)
            if document is not None:
                printOutput(p, document, opts)
    finally:
        # Release the file / HTTP response we opened; leave stdin alone.
        if f is not sys.stdin:
            f.close()


def run(parseMethod, f, encoding):
    """Call ``parseMethod(f, encoding=encoding)`` and return its result.

    If the call raises, the traceback is printed and ``None`` is returned
    instead. KeyboardInterrupt and SystemExit are not intercepted.
    """
    try:
        document = parseMethod(f, encoding=encoding)
    except Exception:
        document = None
        traceback.print_exc()
    return document


def _writeText(data):
    """Write ``data`` to stdout, decoding it first if it is not a ``str``.

    The ``tostring`` helpers used for XML output return bytes on Python 3,
    which the text-mode ``sys.stdout`` refuses to write.
    """
    if not isinstance(data, str):
        data = data.decode("utf-8")
    sys.stdout.write(data)


def _printHtml(document, opts):
    """Serialize ``document`` as HTML to stdout, newline-terminated.

    Serializer keyword arguments are taken from every attribute of ``opts``
    whose name appears in ``serializer.HTMLSerializer.options``.
    """
    kwargs = {}
    for opt in serializer.HTMLSerializer.options:
        try:
            kwargs[opt] = getattr(opts, opt)
        except AttributeError:
            # No command-line switch exists for this serializer option.
            pass
    # An unset quote character must not override the serializer default.
    if not kwargs.get('quote_char'):
        kwargs.pop('quote_char', None)

    tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
    if sys.version_info[0] >= 3:
        encoding = None
    else:
        encoding = "utf-8"

    text = None
    for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
        sys.stdout.write(text)
    # ``text`` stays None when the serializer produced no output at all.
    if text is not None and not text.endswith('\n'):
        sys.stdout.write('\n')


def printOutput(parser, document, opts):
    """Print the parse result in the format selected by ``opts``.

    Optionally prints the character encoding and the parser log first,
    then the document as XML, debug tree, highlighted code or HTML (the
    first of these that is enabled), and finally the list of parse errors
    when ``opts.error`` is set.
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif tb == "lxml":
                import lxml.etree
                _writeText(lxml.etree.tostring(document))
            elif tb == "etree":
                _writeText(utils.default_etree.tostring(document))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            sys.stdout.write(document.hilite("utf-8"))
        elif opts.html:
            _printHtml(document, opts)

    if opts.error:
        errList = [
            "Line %i Col %i" % pos + " " +
            constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars
            for pos, errorcode, datavars in parser.errors
        ]
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")


def getOptParser():
    """Build and return the OptionParser describing the command line."""
    parser = OptionParser(usage=__doc__)

    parser.add_option("-p", "--profile", action="store_true", default=False,
                      dest="profile", help="Use the cProfile profiler to "
                      "produce a detailed log of the run")

    parser.add_option("-t", "--time",
                      action="store_true", default=False, dest="time",
                      help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")

    parser.add_option("-b", "--treebuilder", action="store", type="string",
                      dest="treebuilder", default="etree")

    parser.add_option("-e", "--error", action="store_true", default=False,
                      dest="error", help="Print a list of parse errors")

    parser.add_option("-f", "--fragment", action="store_true", default=False,
                      dest="fragment", help="Parse as a fragment")

    parser.add_option("", "--tree", action="store_true", default=False,
                      dest="tree", help="Output as debug tree")

    parser.add_option("-x", "--xml", action="store_true", default=False,
                      dest="xml", help="Output as xml")

    parser.add_option("", "--no-html", action="store_false", default=True,
                      dest="html", help="Don't output html")

    parser.add_option("", "--hilite", action="store_true", default=False,
                      dest="hilite", help="Output as formatted highlighted code.")

    parser.add_option("-c", "--encoding", action="store_true", default=False,
                      dest="encoding", help="Print character encoding used")

    parser.add_option("", "--inject-meta-charset", action="store_true",
                      default=False, dest="inject_meta_charset",
                      help="inject <meta charset>")

    parser.add_option("", "--strip-whitespace", action="store_true",
                      default=False, dest="strip_whitespace",
                      help="strip whitespace")

    parser.add_option("", "--omit-optional-tags", action="store_true",
                      default=False, dest="omit_optional_tags",
                      help="omit optional tags")

    parser.add_option("", "--quote-attr-values", action="store_true",
                      default=False, dest="quote_attr_values",
                      help="quote attribute values")

    parser.add_option("", "--use-best-quote-char", action="store_true",
                      default=False, dest="use_best_quote_char",
                      help="use best quote character")

    parser.add_option("", "--quote-char", action="store",
                      default=None, dest="quote_char",
                      help="quote character")

    parser.add_option("", "--no-minimize-boolean-attributes",
                      action="store_false", default=True,
                      dest="minimize_boolean_attributes",
                      help="minimize boolean attributes")

    parser.add_option("", "--use-trailing-solidus", action="store_true",
                      default=False, dest="use_trailing_solidus",
                      help="use trailing solidus")

    parser.add_option("", "--space-before-trailing-solidus",
                      action="store_true", default=False,
                      dest="space_before_trailing_solidus",
                      help="add space before trailing solidus")

    parser.add_option("", "--escape-lt-in-attrs", action="store_true",
                      default=False, dest="escape_lt_in_attrs",
                      help="escape less than signs in attribute values")

    parser.add_option("", "--escape-rcdata", action="store_true",
                      default=False, dest="escape_rcdata",
                      help="escape rcdata element values")

    parser.add_option("", "--sanitize", action="store_true", default=False,
                      dest="sanitize", help="sanitize")

    parser.add_option("-l", "--log", action="store_true", default=False,
                      dest="log", help="log state transitions")

    return parser


if __name__ == "__main__":
    parse()