#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
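
# A name immediately followed by a quote is treated as one of these
# prefixes below, e.g. L"wide" or u8"text" is tokenized as a single
# CONSTANT rather than a NAME followed by a string.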


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source
    end contains the index just past the last char of the token in the source
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__


def _GetString(source, start, i):
    """Returns the index just past the string constant starting at i.

    Handles quotes escaped with backslashes, including runs of
    backslashes that escape each other.
    """
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1


def _GetChar(source, start, i):
    """Returns the index just past the character constant starting at i."""
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1

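# Both helpers above return the index one past the closing quote.  A
# couple of illustrative calls (a sketch, not part of the original
# module):
#
#   _GetString('"a\\"b"', 0, 0) == 6   # the escaped quote is skipped
#   _GetChar("'x'", 0, 0)       == 3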

def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a name token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code in
            # an #if 0 block can be handled.  Since we will ignore it
            # anyway, this is probably fine.  So disable the exception
            # and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)

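# A minimal usage sketch (illustrative, not part of the original API):
# tokenizing a short snippet yields one Token per lexical element, in
# source order.
#
#   >>> [(t.token_type, t.name) for t in GetTokens('int x = 42;')]
#   [('NAME', 'int'), ('NAME', 'x'), ('SYNTAX', '='),
#    ('CONSTANT', '42'), ('SYNTAX', ';')]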

if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)
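
# Example invocation (a sketch): python tokenize.py some_file.h
# prints one "TYPE        : name" line per token for each readable file.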