1#!/usr/bin/env python3
2'''Add syntax highlighting to Python source code'''
3
4__author__ = 'Raymond Hettinger'
5
6import builtins
7import functools
8import html as html_module
9import keyword
10import re
11import tokenize
12
13#### Analyze Python Source #################################
14
def is_builtin(s):
    'Report whether s names an attribute of the builtins module'
    try:
        getattr(builtins, s)
    except AttributeError:
        return False
    return True
18
def combine_range(lines, start, end):
    '''Collect the text lying between two (row, col) positions.

       Rows are 1-based indexes into lines and columns are 0-based offsets
       within a line; the text runs from start up to, but not including,
       end.  Returns (text, end) so a caller can pick up the text and adopt
       end as its new position in one assignment.
    '''
    first_row, first_col = start
    last_row, last_col = end
    opening = lines[first_row - 1]
    if first_row == last_row:
        # Both positions sit on the same line: a single slice covers it.
        return opening[first_col:last_col], end
    # Tail of the first line, any whole lines in between, head of the last.
    pieces = [opening[first_col:]]
    pieces += lines[first_row:last_row - 1]
    pieces.append(lines[last_row - 1][:last_col])
    return ''.join(pieces), end
26
def analyze_python(source):
    '''Split Python source into classified chunks for syntax highlighting.

       Yields (category, text) pairs in source order.  The category is one
       of 'comment', 'operator', 'string', 'docstring', 'definition',
       'defname', 'keyword' or 'builtin'; an empty category marks the
       unclassified text lying between two classified tokens.
    '''
    src_lines = source.splitlines(True)
    # Extra empty row so combine_range can index one row past the last
    # source line (presumably where the tokenizer's final token lands).
    src_lines.append('')
    line_feed = iter(src_lines)

    def readline():
        # Hand the tokenizer one line per call, then '' once exhausted.
        return next(line_feed, '')

    # Seed the "previous token" state so the first real token is treated
    # as having no significant predecessor.
    last_type, last_str = tokenize.COMMENT, ''
    cursor = (1, 0)     # position up to which source text has been emitted
    tokens = tokenize.generate_tokens(readline)
    for cur_type, cur_str, tok_start, tok_end, _ in tokens:
        category = ''
        if cur_type == tokenize.NAME:
            if cur_str in ('def', 'class', 'import', 'from'):
                category = 'definition'
            elif last_str in ('def', 'class'):
                # The name right after def/class is the thing being defined.
                category = 'defname'
            elif keyword.iskeyword(cur_str):
                category = 'keyword'
            elif is_builtin(cur_str) and last_str != '.':
                # A builtin's name used as an attribute (x.len) is not one.
                category = 'builtin'
        elif cur_type == tokenize.STRING:
            # Heuristic: a string that opens an indented block or starts in
            # column 0 is taken to be a docstring.
            opens_block = last_type == tokenize.INDENT or tok_start[1] == 0
            category = 'docstring' if opens_block else 'string'
        elif cur_type == tokenize.COMMENT:
            category = 'comment'
        elif cur_type == tokenize.OP:
            # Brackets, separators and '@' stay unclassified.
            if cur_str[:1] not in '{}[](),.:;@':
                category = 'operator'
        if category:
            # Flush the unclassified text preceding this token, then the
            # token itself, and move the cursor past it.
            gap, cursor = combine_range(src_lines, cursor, tok_start)
            yield '', gap
            cursor = tok_end
            yield category, cur_str
        last_type, last_str = cur_type, cur_str
    # Whatever remains between the last classified token and the final token.
    tail, cursor = combine_range(src_lines, cursor, tok_end)
    yield '', tail
65
66#### Raw Output  ###########################################
67
def raw_highlight(classified_text):
    '''Dump the classification as plain text, one chunk per line.

       Each line shows the category right-aligned in 15 columns ('plain'
       when the category is empty) followed by the repr of the chunk text.
    '''
    return ''.join(
        '%15s:  %r\n' % (kind or 'plain', text)
        for kind, text in classified_text
    )
74
75#### ANSI Output ###########################################
76
# Category -> (opener, closer) pair of ANSI escape sequences.  Each opener
# selects a foreground color; every closer is the reset sequence.
default_ansi = {
    'comment': ('\033[0;31m', '\033[0m'),       # red
    'string': ('\033[0;32m', '\033[0m'),        # green
    'docstring': ('\033[0;32m', '\033[0m'),     # green
    'keyword': ('\033[0;33m', '\033[0m'),       # yellow
    'builtin': ('\033[0;35m', '\033[0m'),       # magenta
    'definition': ('\033[0;33m', '\033[0m'),    # yellow
    'defname': ('\033[0;34m', '\033[0m'),       # blue
    'operator': ('\033[0;33m', '\033[0m'),      # yellow
}

def ansi_highlight(classified_text, colors=default_ansi):
    '''Colorize classified text for a terminal using ANSI escape sequences.

       colors maps a category to an (opener, closer) pair; chunks whose
       category is not in the mapping are emitted unchanged.
    '''
    # http://en.wikipedia.org/wiki/ANSI_escape_code
    uncolored = ('', '')
    pieces = []
    for category, chunk in classified_text:
        prefix, suffix = colors.get(category, uncolored)
        pieces.extend((prefix, chunk, suffix))
    return ''.join(pieces)
96
97#### HTML Output ###########################################
98
def html_highlight(classified_text, opener='<pre class="python">\n',
                   closer='</pre>\n'):
    '''Render classified text as an HTML fragment.

       Every chunk is HTML-escaped; a chunk that has a category is also
       wrapped in a <span> whose class attribute is that category.  The
       result is bracketed by opener and closer.
    '''
    def render(kind, text):
        # Unclassified text is only escaped, never wrapped.
        if not kind:
            return html_module.escape(text)
        return ('<span class="%s">' % kind) + html_module.escape(text) + '</span>'

    body = ''.join(render(kind, text) for kind, text in classified_text)
    return opener + body + closer
110
# Default stylesheet for build_html_page.  Each key is a CSS class selector
# for one of the category names that html_highlight writes into a
# <span class="...">; each value is the declaration block for that selector.
# build_html_page emits every entry as "selector {declarations}" on its own
# line.
default_css = {
    '.comment': '{color: crimson;}',
    '.string':  '{color: forestgreen;}',
    '.docstring': '{color: forestgreen; font-style:italic;}',
    '.keyword': '{color: darkorange;}',
    '.builtin': '{color: purple;}',
    '.definition': '{color: darkorange; font-weight:bold;}',
    '.defname': '{color: blue;}',
    '.operator': '{color: brown;}',
}
121
# Default page skeleton for build_html_page, filled in with str.format().
# {title}, {css} and {body} are its replacement fields; because of that, a
# custom template that needs literal braces must double them ({{ and }}).
default_html = '''\
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
          "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-type" content="text/html;charset=UTF-8">
<title> {title} </title>
<style type="text/css">
{css}
</style>
</head>
<body>
{body}
</body>
</html>
'''
138
def build_html_page(classified_text, title='python',
                    css=default_css, html=default_html):
    '''Build a standalone HTML page showing the colorized source.

       css maps selectors to declaration blocks and is rendered one rule
       per line; html is a str.format() template with {title}, {css} and
       {body} fields.  The title is HTML-escaped before insertion.
    '''
    stylesheet = '\n'.join(
        '%s %s' % (selector, rules) for selector, rules in css.items()
    )
    fragment = html_highlight(classified_text)
    safe_title = html_module.escape(title)
    return html.format(title=safe_title, css=stylesheet, body=fragment)
146
147#### LaTeX Output ##########################################
148
# Default bodies for the per-category LaTeX macros.  latex_highlight turns
# each entry into \newcommand{\py<category>}[1]{<body>}, so #1 stands for
# the source text being colored.
default_latex_commands = {
    'comment': r'{\color{red}#1}',
    'string': r'{\color{ForestGreen}#1}',
    'docstring': r'{\emph{\color{ForestGreen}#1}}',
    'keyword': r'{\color{orange}#1}',
    'builtin': r'{\color{purple}#1}',
    'definition': r'{\color{orange}#1}',
    'defname': r'{\color{blue}#1}',
    'operator': r'{\color{brown}#1}',
}

# Default document skeleton, filled in with the % operator: %(macros)s
# receives the \newcommand lines, %(title)s the heading and %(body)s the
# highlighted source, which is placed inside an alltt environment.
default_latex_document = r'''
\documentclass{article}
\usepackage{alltt}
\usepackage{upquote}
\usepackage{color}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage[cm]{fullpage}
%(macros)s
\begin{document}
\center{\LARGE{%(title)s}}
\begin{alltt}
%(body)s
\end{alltt}
\end{document}
'''

def alltt_escape(s):
    '''Escape the characters that stay special inside an alltt environment.

       Backslash becomes \\textbackslash{} and each brace gets a leading
       backslash; every other character is returned unchanged.
    '''
    xlat = {'{': r'\{', '}': r'\}', '\\': r'\textbackslash{}'}
    return re.sub(r'[\\{}]', lambda mo: xlat[mo.group()], s)

def latex_highlight(classified_text, title='python',
                    commands=default_latex_commands,
                    document=default_latex_document):
    '''Create a complete LaTeX document with colorized source code.

       classified_text is an iterable of (category, text) pairs.  commands
       maps each category to the body of its \\py<category> macro.
       document is a %-style template with %(macros)s, %(title)s and
       %(body)s fields.  Returns the filled-in document as a string.
    '''
    macros = '\n'.join(r'\newcommand{\py%s}[1]{%s}' % c for c in commands.items())
    result = []
    for kind, text in classified_text:
        if kind:
            result.append(r'\py%s{' % kind)
        result.append(alltt_escape(text))
        if kind:
            result.append('}')
    # NOTE(review): title is inserted verbatim, outside the alltt
    # environment, so any LaTeX special characters in it (for example the
    # underscores of a file name) are the caller's responsibility.
    # Fill the template the caller passed in, not the module default, so a
    # custom document argument actually takes effect.
    return document % dict(title=title, macros=macros, body=''.join(result))
194
195
# Command-line entry point: classify SOURCEFILE and emit it in the format
# selected by the options (ANSI-colored terminal text by default).
if __name__ == '__main__':
    import argparse
    import os.path
    import pathlib
    import sys
    import textwrap
    import webbrowser

    parser = argparse.ArgumentParser(
            description = 'Add syntax highlighting to Python source code',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog = textwrap.dedent('''
                examples:

                  # Show syntax highlighted code in the terminal window
                  $ ./highlight.py myfile.py

                  # Colorize myfile.py and display in a browser
                  $ ./highlight.py -b myfile.py

                  # Create an HTML section to embed in an existing webpage
                  ./highlight.py -s myfile.py

                  # Create a complete HTML file
                  $ ./highlight.py -c myfile.py > myfile.html

                  # Create a PDF using LaTeX
                  $ ./highlight.py -l myfile.py | pdflatex

            '''))
    parser.add_argument('sourcefile', metavar = 'SOURCEFILE',
            help = 'file containing Python sourcecode')
    parser.add_argument('-b', '--browser', action = 'store_true',
            help = 'launch a browser to show results')
    parser.add_argument('-c', '--complete', action = 'store_true',
            help = 'build a complete html webpage')
    parser.add_argument('-l', '--latex', action = 'store_true',
            help = 'build a LaTeX document')
    parser.add_argument('-r', '--raw', action = 'store_true',
            help = 'raw parse of categorized text')
    parser.add_argument('-s', '--section', action = 'store_true',
            help = 'show an HTML section rather than a complete webpage')
    args = parser.parse_args()

    if args.section and (args.browser or args.complete):
        parser.error('The -s/--section option is incompatible with '
                     'the -b/--browser or -c/--complete options')

    sourcefile = args.sourcefile
    # tokenize.open() decodes the file the way the interpreter would: it
    # honors a coding cookie or BOM and otherwise assumes UTF-8, rather
    # than depending on the platform's locale encoding.
    with tokenize.open(sourcefile) as f:
        source = f.read()
    classified_text = analyze_python(source)

    # Option precedence when several formats are requested:
    # raw, then full HTML page, then HTML section, then LaTeX, else ANSI.
    if args.raw:
        encoded = raw_highlight(classified_text)
    elif args.complete or args.browser:
        encoded = build_html_page(classified_text, title=sourcefile)
    elif args.section:
        encoded = html_highlight(classified_text)
    elif args.latex:
        encoded = latex_highlight(classified_text, title=sourcefile)
    else:
        encoded = ansi_highlight(classified_text)

    if args.browser:
        # The page is written to the current directory, named after the source.
        htmlfile = os.path.splitext(os.path.basename(sourcefile))[0] + '.html'
        # The default page template declares charset=UTF-8, so write the
        # file as UTF-8 regardless of the locale's default encoding.
        with open(htmlfile, 'w', encoding='utf-8') as f:
            f.write(encoded)
        # as_uri() yields a well-formed file: URL (percent-encoded, and
        # correct for Windows drive paths), unlike 'file://' + path.
        webbrowser.open(pathlib.Path(os.path.abspath(htmlfile)).as_uri())
    else:
        sys.stdout.write(encoded)
266