1# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
3
4"""Better tokenizing for coverage.py."""
5
6import codecs
7import keyword
8import re
9import sys
10import token
11import tokenize
12
13from coverage import env
14from coverage.backward import iternext
15from coverage.misc import contract
16
17
def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    `toks` is an iterable of 5-tuples shaped like the ones produced by
    tokenize.generate_tokens(): (type, text, (srow, scol), (erow, ecol), line).

    Yields the same 5-tuples, with an extra tuple of fake type 99999 and text
    "\\\n" inserted wherever a backslash continuation was found.

    """
    last_line = None
    last_lineno = -1
    # The type of the most recent token that wasn't a NEWLINE or NL, so that
    # when a new line starts, it is the last real token of the previous line.
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.  This also covers a
                    # comment that trails code on the same line.
                    inject_backslash = False
                elif ttype == token.STRING:
                    first_line = ttext.split('\n', 1)[0]
                    if "\n" in ttext and first_line.endswith('\\'):
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
            last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            # Remember the last meaningful token, not merely the first token
            # of each line: the comment check above needs to know how the
            # previous line *ended*.
            last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
72
73
@contract(source='unicode')
def source_token_lines(source):
    """Generate one list of classified tokens per line of `source`.

    Every yielded line is a list of (token class, token text) pairs::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    The token class is the first three letters of the lower-cased tokenize
    name, except that keywords are 'key' and inter-token spacing is 'ws'.

    Concatenating the texts of each line and joining the lines with newlines
    reproduces `source`, with two differences: trailing whitespace is not
    preserved, and a final line with no newline is indistinguishable from a
    final line with a newline.

    """
    # Token types that never contribute visible text to a line.
    silent_types = frozenset(
        (token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL)
    )
    current = []
    col = 0

    cleaned = source.expandtabs(8)
    cleaned = cleaned.replace('\r\n', '\n')
    # Form feeds are replaced because of http://bugs.python.org/issue19035
    cleaned = cleaned.replace('\f', ' ')

    for ttype, ttext, start, end, _ in phys_tokens(generate_tokens(cleaned)):
        scol = start[1]
        ecol = end[1]
        need_gap = True
        # Splitting with a capturing group keeps the newlines as their own
        # pieces, so multi-line tokens are spread across output lines.
        for piece in re.split('(\n)', ttext):
            if piece == '\n':
                yield current
                current = []
                col = 0
                advance = False
            elif piece == '' or ttype in silent_types:
                advance = False
            else:
                if need_gap and scol > col:
                    # Fill the gap between the previous token and this one.
                    current.append(("ws", u" " * (scol - col)))
                    need_gap = False
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                else:
                    tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                current.append((tok_class, piece))
                advance = True
            # Any later piece of this token starts a fresh line.
            scol = 0
        if advance:
            col = ecol

    if current:
        yield current
126
127
class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it.  Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        # The text most recently tokenized, and the token list made from it.
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`.

        `text` is the full source text, rather than a readline callable.

        Returns a list of tokens.  The same list object is returned for
        consecutive calls with equal `text`, so callers should not mutate it.

        Any exception raised by `tokenize.generate_tokens` propagates, and
        leaves the cache as it was before the call.

        """
        if text != self.last_text:
            readline = iternext(text.splitlines(True))
            # Tokenize before touching the cache: if tokenizing raises, we
            # must not be left with the new text paired with stale tokens.
            tokens = list(tokenize.generate_tokens(readline))
            self.last_text = text
            self.last_tokens = tokens
        return self.last_tokens
151
# Create our generate_tokens cache as a callable replacement function.  This
# is the bound method of one module-level CachedTokenizer instance, so every
# caller of generate_tokens() in this module shares the same cache.
generate_tokens = CachedTokenizer().generate_tokens
154
155
# Matches an encoding declaration comment: optional leading spaces or tabs, a
# "#", then "coding" followed by ":" or "=", capturing the encoding name.
# MULTILINE lets "^" match at the start of every line of a multi-line string.
COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)
157
@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.  With neither a BOM nor an
    encoding cookie in the first two lines, that is 'ascii'; with only a UTF-8
    BOM it is 'utf-8-sig'.

    Raises SyntaxError if the cookie names an unknown codec, or if a BOM is
    present but the cookie names something other than utf-8.

    """
    assert isinstance(source, bytes)

    # The logic below is adapted from detect_encoding() in Py3.2's tokenize
    # module, which pulls its input from a readline-style callable.
    next_line = iternext(source.splitlines(True))

    def normalize(raw_name):
        """Imitates get_normal_name in tokenizer.c."""
        # Only the first 12 characters are examined.
        head = raw_name[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", head):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", head):
            return "iso-8859-1"
        return raw_name

    def next_or_empty():
        """Get the next source line, or '' when the source is exhausted."""
        try:
            return next_line()
        except StopIteration:
            return ''

    def cookie_in(line, after_bom):
        """Return the encoding declared in `line`, or None if there is none."""
        try:
            text = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        found = COOKIE_RE.findall(text)
        if not found:
            return None

        name = normalize(found[0])
        try:
            codec = codecs.lookup(name)
        except LookupError:
            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + name)

        if not after_bom:
            return name

        # A BOM was seen, so the cookie has to agree that this is utf-8.
        # codecs in 2.3 were raw tuples of functions, assume the best.
        if getattr(codec, 'name', name) != 'utf-8':
            # This behavior mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return name + '-sig'

    # What to report when no cookie is found.
    fallback = 'ascii'
    saw_bom = False

    line = next_or_empty()
    if line.startswith(codecs.BOM_UTF8):
        saw_bom = True
        line = line[3:]
        fallback = 'utf-8-sig'

    # Only the first two lines are examined for a cookie.
    for more_to_read in (True, False):
        if not line:
            return fallback
        declared = cookie_in(line, saw_bom)
        if declared:
            return declared
        if more_to_read:
            line = next_or_empty()

    return fallback
251
252
@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding, as reported by
    `tokenize.detect_encoding`.

    """
    line_reader = iternext(source.splitlines(True))
    # detect_encoding also returns the lines it consumed; we don't need them.
    encoding, _ = tokenize.detect_encoding(line_reader)
    return encoding
264
265
# Pick the implementation that matches the running interpreter.
source_encoding = _source_encoding_py3 if env.PY3 else _source_encoding_py2
270
271
@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin refuses a Unicode source string that
    contains an encoding declaration.  It also encodes the text to utf8 and
    then tries to interpret those bytes according to the declaration.

    To avoid both problems, the coding declaration is neutered before the
    source is handed to compile().  On Python 2, a Unicode `filename` is
    encoded with the file system encoding first.

    Returns the compiled code object.

    """
    neutered = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode):
        fs_encoding = sys.getfilesystemencoding()
        filename = filename.encode(fs_encoding, "replace")
    return compile(neutered, filename, mode)
289
290
@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered.

    PEP 263 only recognizes an encoding declaration on the first or second
    line of a file, so only those two lines are examined.  Text further down
    that merely looks like a declaration (for example inside a docstring) is
    left untouched.

    The number of lines, and everything after the declaration on its line,
    are preserved.

    """
    # Find the offset just past the end of the second line.
    head_end = 0
    for _ in range(2):
        newline_at = source.find("\n", head_end)
        if newline_at == -1:
            # Fewer than two complete lines: the head is the whole source.
            head_end = len(source)
            break
        head_end = newline_at + 1

    head = COOKIE_RE.sub("# (deleted declaration)", source[:head_end])
    return head + source[head_end:]
296