#------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#------------------------------------------------------------------------------
import re
import sys

from .ply import lex
from .ply.lex import TOKEN


class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
                 type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            on_lbrace_func, on_rbrace_func:
                Called when an LBRACE or RBRACE is encountered
                (likely to push/pop type_lookup_func's scope)

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Keeps track of the last token returned from self.token()
        self.last_token = None

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
        self.pragma_pattern = re.compile(r'[ \t]*pragma\W')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        self.lexer.input(text)

    def token(self):
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """ Find the column of the token in its line.
        """
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr
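
    # Worked example (an illustrative note, not part of the original module):
    # with lexdata == 'a;\nint x;' and a token for 'x' at lexpos 7,
    # rfind('\n', 0, 7) returns 2 (the index of the newline), so the column
    # is 7 - 2 == 5, 1-based. When no newline precedes the token, rfind
    # returns -1 and the arithmetic still yields a 1-based column.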

    ######################-- PRIVATE --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        '_BOOL', '_COMPLEX', 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
        'REGISTER', 'OFFSETOF',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE', '__INT128',
    )

    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        elif keyword == '_COMPLEX':
            keyword_map['_Complex'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # Constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
        'PPPRAGMA',     # 'pragma'
        'PPPRAGMASTR',
    )

    ##
    ## Regexes for use in tokens
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'
    bin_prefix = '0[bB]'
    bin_digits = '[01]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt
    bin_constant = bin_prefix+bin_digits+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'
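
    # Illustration (hedged: these sample literals are additions for
    # clarity, not part of the grammar definition). The patterns above
    # classify integer literals like so:
    #   '42ULL'  matches decimal_constant ('ULL' taken by integer_suffix_opt)
    #   '052'    matches octal_constant
    #   '0x2A'   matches hex_constant
    #   '0b101u' matches bin_constant
    #   '089'    triggers bad_octal_constant (8 and 9 are not octal digits)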

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means also accepting some incorrect
    # code.
    #
    # The original regexes were taken verbatim from the C syntax definition,
    # and were later modified to avoid worst-case exponential running time.
    #
    # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    # decimal_escape = r"""(\d+)"""
    # hex_escape = r"""(x[0-9a-fA-F]+)"""
    # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
    #
    # The following modifications were made to avoid the ambiguity that allowed
    # backtracking (https://github.com/eliben/pycparser/issues/61):
    #
    # - \x counts as a simple escape only when it is not followed by a hex
    #   digit, to avoid ambiguity with hex_escape.
    # - hex_escape allows one or more hex characters, but requires that the
    #   next character (if any) is not hex.
    # - decimal_escape allows one or more decimal characters, but requires
    #   that the next character (if any) is not a decimal.
    # - bad_escape does not allow any decimals (8-9), to avoid conflicting
    #   with the permissive decimal_escape.
    #
    # Without this change, Python's `re` module would recursively try parsing
    # each ambiguous escape sequence in multiple ways. e.g. `\123` could be
    # parsed as `\1`+`23`, `\12`+`3`, and `\123`.

    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
    decimal_escape = r"""(\d+)(?!\d)"""
    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

    # This complicated regex with lookahead might be slow for strings, but
    # since all of the valid escapes (including \x) allow 0 or more
    # non-escaped characters after the first character,
    # simple_escape+decimal_escape+hex_escape can be simplified to:

    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    multicharacter_constant = "'"+cconst_char+"{2,4}'"
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'

    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )
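
    # A worked example of the state machinery (explanatory comment added
    # here; the directive below is hypothetical input): given
    #
    #   #line 7 "foo.c"
    #
    # t_PPHASH sees the '#' and switches to the 'ppline' state;
    # t_ppline_PPLINE consumes 'line', t_ppline_LINE_NUMBER stores '7' in
    # self.pp_line, and t_ppline_FILENAME stores 'foo.c' in self.pp_filename.
    # t_ppline_NEWLINE then sets lexer.lineno and self.filename and returns
    # to the INITIAL state. No token is emitted for the directive itself.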

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)

    ##
    ## Rules for the pppragma state
    ##
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        return t

    t_pppragma_ignore = ' \t'

    def t_pppragma_STR(self, t):
        r'.+'
        t.type = 'PPPRAGMASTR'
        return t

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter. The trouble begins with yacc's
    # lookahead token. If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID. So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.
    #
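    # A sketch of callbacks a caller might supply for this purpose (hedged:
    # the names below are hypothetical, not pycparser's actual API; the real
    # parser keeps richer scope information):
    #
    #   scopes = [set()]                          # stack of typedef-name sets
    #   def on_lbrace(): scopes.append(set())     # '{' opens a scope
    #   def on_rbrace(): scopes.pop()             # '}' closes it
    #   def is_type(name):
    #       return any(name in s for s in scopes)
    #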
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t

    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bin_constant)
    def t_INT_CONST_BIN(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(multicharacter_constant)
    def t_INT_CONST_CHAR(self, t):
        return t

    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)
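

if __name__ == '__main__':
    # A minimal smoke test (a sketch; run it via `python -m` from within the
    # package so the relative `.ply` import resolves). The stub callbacks
    # below are placeholders for illustration, not library API.
    def errfoo(msg, line, col):
        print('Lex error: %s at %s:%s' % (msg, line, col))

    clex = CLexer(errfoo, lambda: None, lambda: None, lambda name: False)
    clex.build()
    clex.input('int x = 0x2A;')
    # Expected token types: INT, ID, EQUALS, INT_CONST_HEX, SEMI
    while True:
        tok = clex.token()
        if tok is None:
            break
        print(tok)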