#------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#------------------------------------------------------------------------------
import re
import sys

from .ply import lex
from .ply.lex import TOKEN


class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
                 type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            on_lbrace_func, on_rbrace_func:
                Called when an LBRACE or RBRACE is encountered
                (likely to push/pop type_lookup_func's scope)

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Keeps track of the last token returned from self.token()
        self.last_token = None

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
        self.pragma_pattern = re.compile(r'[ \t]*pragma\W')
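        # For example, GCC's cpp emits line markers such as
        # '# 1 "test.c"' and '# 5 "/usr/include/stdio.h" 2', while
        # source-level directives look like '#line 10 "file.c"'
        # and '#pragma once'.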

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately because the PLY
            manual warns against calling lex.lex inside
            __init__.
        """
        self.lexer = lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        self.lexer.input(text)

    def token(self):
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """ Find the 1-based column of the token in its line.
        """
        # rfind returns -1 when the token is on the first line, which
        # keeps the result 1-based in that case as well.
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        '_BOOL', '_COMPLEX', 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
        'REGISTER', 'OFFSETOF',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE', '__INT128',
    )

    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        elif keyword == '_COMPLEX':
            keyword_map['_Complex'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword
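    # The resulting map goes from C spellings to token names, e.g.
    # (illustrative): {'_Bool': '_BOOL', 'auto': 'AUTO', ..., 'while': 'WHILE'}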

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
        'PPPRAGMA',     # 'pragma'
        'PPPRAGMASTR',
    )

    ##
    ## Regexes for use in tokens
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'
    bin_prefix = '0[bB]'
    bin_digits = '[01]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt
    bin_constant = bin_prefix+bin_digits+integer_suffix_opt
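    # For illustration, these patterns match literals such as '0', '42u',
    # '0777L', '0x1FuLL' and '0b1010'.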

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means sometimes accepting incorrect
    # code.
    #
    # The original regexes were taken verbatim from the C syntax definition,
    # and were later modified to avoid worst-case exponential running time.
    #
    #   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    #   decimal_escape = r"""(\d+)"""
    #   hex_escape = r"""(x[0-9a-fA-F]+)"""
    #   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
    #
    # The following modifications were made to remove the ambiguity that
    # allowed backtracking (https://github.com/eliben/pycparser/issues/61):
    #
    # - \x was removed from simple_escape; it now matches only when not
    #   followed by a hex digit, to avoid ambiguity with hex_escape.
    # - hex_escape allows one or more hex characters, but requires that the
    #   next character (if any) is not hex.
    # - decimal_escape allows one or more decimal characters, but requires
    #   that the next character (if any) is not a decimal digit.
    # - bad_escape does not allow any decimals (8-9), to avoid conflicting
    #   with the permissive decimal_escape.
    #
    # Without this change, Python's `re` module would recursively try parsing
    # each ambiguous escape sequence in multiple ways. For example, `\123`
    # could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.

    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
    decimal_escape = r"""(\d+)(?!\d)"""
    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

    # This complicated regex with lookahead might be slow for strings, but
    # since all of the valid escapes (including \x) allow zero or more
    # non-escaped characters after the first character, the union of
    # simple_escape, decimal_escape and hex_escape can be simplified for
    # strings to:

    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    multicharacter_constant = "'"+cconst_char+"{2,4}'"
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""
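    # Illustrative matches: char_const matches "'a'" and "'\n'", wchar_const
    # matches "L'a'", multicharacter_constant matches "'ab'", and
    # bad_char_const catches "''" as well as over-long constants
    # like "'abcde'".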

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'
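    # Illustrative matches: floating_constant matches '1.5', '.5e10' and
    # '3.f'; hex_floating_constant matches '0x1.8p3' (C99 requires the
    # binary exponent for hex floats).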

    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)
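
    # For example, on the input '# 7 "foo.h"\n' the lexer enters 'ppline'
    # at the hash, records pp_line='7' and pp_filename='foo.h', and the
    # NEWLINE rule then updates lineno and filename and returns to the
    # 'INITIAL' state without emitting any token.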

    ##
    ## Rules for the pppragma state
    ##
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        return t

    t_pppragma_ignore = ' \t'

    def t_pppragma_STR(self, t):
        r'.+'
        t.type = 'PPPRAGMASTR'
        return t

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter.  The trouble begins with yacc's
    # lookahead token.  If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID.  So, we need
    # to open and close scopes from within the lexer.
    # The same applies to the TT immediately after the end of the function.
    #
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t
    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t
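
    # A minimal sketch of how the brace callbacks are typically wired to a
    # scope stack (hypothetical names, for illustration only):
    #
    #   scopes = [set()]                          # scopes[-1] is innermost
    #   on_lbrace_func = lambda: scopes.append(set())
    #   on_rbrace_func = lambda: scopes.pop()
    #   type_lookup_func = lambda name: any(name in s for s in scopes)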

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict matching order (otherwise PLY sorts
    # string-defined rules by decreasing regex length, which would try
    # the decimal rule before the others and mis-lex constants).
    #
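    # For instance, '1.5' must lex as a single FLOAT_CONST rather than as
    # INT_CONST_DEC '1' followed by '.' and '5', and the hex rule must see
    # '0x1F' before the octal/decimal rules can claim its leading '0'.
    #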
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bin_constant)
    def t_INT_CONST_BIN(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(multicharacter_constant)
    def t_INT_CONST_CHAR(self, t):
        return t

    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t
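
    # For example, after a 'typedef unsigned int uint;' registers 'uint'
    # with the type table, type_lookup_func('uint') returns True and later
    # occurrences of 'uint' are emitted as TYPEID rather than ID.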

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)