1from collections import Mapping 2import re 3import sys 4 5# Reason last stmt is continued (or C_NONE if it's not). 6(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE, 7 C_STRING_NEXT_LINES, C_BRACKET) = range(5) 8 9if 0: # for throwaway debugging output 10 def dump(*stuff): 11 sys.__stdout__.write(" ".join(map(str, stuff)) + "\n") 12 13# Find what looks like the start of a popular stmt. 14 15_synchre = re.compile(r""" 16 ^ 17 [ \t]* 18 (?: while 19 | else 20 | def 21 | return 22 | assert 23 | break 24 | class 25 | continue 26 | elif 27 | try 28 | except 29 | raise 30 | import 31 | yield 32 ) 33 \b 34""", re.VERBOSE | re.MULTILINE).search 35 36# Match blank line or non-indenting comment line. 37 38_junkre = re.compile(r""" 39 [ \t]* 40 (?: \# \S .* )? 41 \n 42""", re.VERBOSE).match 43 44# Match any flavor of string; the terminating quote is optional 45# so that we're robust in the face of incomplete program text. 46 47_match_stringre = re.compile(r""" 48 \""" [^"\\]* (?: 49 (?: \\. | "(?!"") ) 50 [^"\\]* 51 )* 52 (?: \""" )? 53 54| " [^"\\\n]* (?: \\. [^"\\\n]* )* "? 55 56| ''' [^'\\]* (?: 57 (?: \\. | '(?!'') ) 58 [^'\\]* 59 )* 60 (?: ''' )? 61 62| ' [^'\\\n]* (?: \\. [^'\\\n]* )* '? 63""", re.VERBOSE | re.DOTALL).match 64 65# Match a line that starts with something interesting; 66# used to find the first item of a bracket structure. 67 68_itemre = re.compile(r""" 69 [ \t]* 70 [^\s#\\] # if we match, m.end()-1 is the interesting char 71""", re.VERBOSE).match 72 73# Match start of stmts that should be followed by a dedent. 74 75_closere = re.compile(r""" 76 \s* 77 (?: return 78 | break 79 | continue 80 | raise 81 | pass 82 ) 83 \b 84""", re.VERBOSE).match 85 86# Chew up non-special chars as quickly as possible. If match is 87# successful, m.end() less 1 is the index of the last boring char 88# matched. If match is unsuccessful, the string starts with an 89# interesting char. 90 91_chew_ordinaryre = re.compile(r""" 92 [^[\](){}#'"\\]+ 93""", re.VERBOSE).match 94 95 96class StringTranslatePseudoMapping(Mapping): 97 r"""Utility class to be used with str.translate() 98 99 This Mapping class wraps a given dict. When a value for a key is 100 requested via __getitem__() or get(), the key is looked up in the 101 given dict. If found there, the value from the dict is returned. 102 Otherwise, the default value given upon initialization is returned. 103 104 This allows using str.translate() to make some replacements, and to 105 replace all characters for which no replacement was specified with 106 a given character instead of leaving them as-is. 107 108 For example, to replace everything except whitespace with 'x': 109 110 >>> whitespace_chars = ' \t\n\r' 111 >>> preserve_dict = {ord(c): ord(c) for c in whitespace_chars} 112 >>> mapping = StringTranslatePseudoMapping(preserve_dict, ord('x')) 113 >>> text = "a + b\tc\nd" 114 >>> text.translate(mapping) 115 'x x x\tx\nx' 116 """ 117 def __init__(self, non_defaults, default_value): 118 self._non_defaults = non_defaults 119 self._default_value = default_value 120 121 def _get(key, _get=non_defaults.get, _default=default_value): 122 return _get(key, _default) 123 self._get = _get 124 125 def __getitem__(self, item): 126 return self._get(item) 127 128 def __len__(self): 129 return len(self._non_defaults) 130 131 def __iter__(self): 132 return iter(self._non_defaults) 133 134 def get(self, key, default=None): 135 return self._get(key) 136 137 138class Parser: 139 140 def __init__(self, indentwidth, tabwidth): 141 self.indentwidth = indentwidth 142 self.tabwidth = tabwidth 143 144 def set_str(self, s): 145 assert len(s) == 0 or s[-1] == '\n' 146 self.str = s 147 self.study_level = 0 148 149 # Return index of a good place to begin parsing, as close to the 150 # end of the string as possible. This will be the start of some 151 # popular stmt like "if" or "def". Return None if none found: 152 # the caller should pass more prior context then, if possible, or 153 # if not (the entire program text up until the point of interest 154 # has already been tried) pass 0 to set_lo. 155 # 156 # This will be reliable iff given a reliable is_char_in_string 157 # function, meaning that when it says "no", it's absolutely 158 # guaranteed that the char is not in a string. 159 160 def find_good_parse_start(self, is_char_in_string=None, 161 _synchre=_synchre): 162 str, pos = self.str, None 163 164 if not is_char_in_string: 165 # no clue -- make the caller pass everything 166 return None 167 168 # Peek back from the end for a good place to start, 169 # but don't try too often; pos will be left None, or 170 # bumped to a legitimate synch point. 171 limit = len(str) 172 for tries in range(5): 173 i = str.rfind(":\n", 0, limit) 174 if i < 0: 175 break 176 i = str.rfind('\n', 0, i) + 1 # start of colon line 177 m = _synchre(str, i, limit) 178 if m and not is_char_in_string(m.start()): 179 pos = m.start() 180 break 181 limit = i 182 if pos is None: 183 # Nothing looks like a block-opener, or stuff does 184 # but is_char_in_string keeps returning true; most likely 185 # we're in or near a giant string, the colorizer hasn't 186 # caught up enough to be helpful, or there simply *aren't* 187 # any interesting stmts. In any of these cases we're 188 # going to have to parse the whole thing to be sure, so 189 # give it one last try from the start, but stop wasting 190 # time here regardless of the outcome. 191 m = _synchre(str) 192 if m and not is_char_in_string(m.start()): 193 pos = m.start() 194 return pos 195 196 # Peeking back worked; look forward until _synchre no longer 197 # matches. 198 i = pos + 1 199 while 1: 200 m = _synchre(str, i) 201 if m: 202 s, i = m.span() 203 if not is_char_in_string(s): 204 pos = s 205 else: 206 break 207 return pos 208 209 # Throw away the start of the string. Intended to be called with 210 # find_good_parse_start's result. 211 212 def set_lo(self, lo): 213 assert lo == 0 or self.str[lo-1] == '\n' 214 if lo > 0: 215 self.str = self.str[lo:] 216 217 # Build a translation table to map uninteresting chars to 'x', open 218 # brackets to '(', close brackets to ')' while preserving quotes, 219 # backslashes, newlines and hashes. This is to be passed to 220 # str.translate() in _study1(). 221 _tran = {} 222 _tran.update((ord(c), ord('(')) for c in "({[") 223 _tran.update((ord(c), ord(')')) for c in ")}]") 224 _tran.update((ord(c), ord(c)) for c in "\"'\\\n#") 225 _tran = StringTranslatePseudoMapping(_tran, default_value=ord('x')) 226 227 # As quickly as humanly possible <wink>, find the line numbers (0- 228 # based) of the non-continuation lines. 229 # Creates self.{goodlines, continuation}. 230 231 def _study1(self): 232 if self.study_level >= 1: 233 return 234 self.study_level = 1 235 236 # Map all uninteresting characters to "x", all open brackets 237 # to "(", all close brackets to ")", then collapse runs of 238 # uninteresting characters. This can cut the number of chars 239 # by a factor of 10-40, and so greatly speed the following loop. 240 str = self.str 241 str = str.translate(self._tran) 242 str = str.replace('xxxxxxxx', 'x') 243 str = str.replace('xxxx', 'x') 244 str = str.replace('xx', 'x') 245 str = str.replace('xx', 'x') 246 str = str.replace('\nx', '\n') 247 # note that replacing x\n with \n would be incorrect, because 248 # x may be preceded by a backslash 249 250 # March over the squashed version of the program, accumulating 251 # the line numbers of non-continued stmts, and determining 252 # whether & why the last stmt is a continuation. 253 continuation = C_NONE 254 level = lno = 0 # level is nesting level; lno is line number 255 self.goodlines = goodlines = [0] 256 push_good = goodlines.append 257 i, n = 0, len(str) 258 while i < n: 259 ch = str[i] 260 i = i+1 261 262 # cases are checked in decreasing order of frequency 263 if ch == 'x': 264 continue 265 266 if ch == '\n': 267 lno = lno + 1 268 if level == 0: 269 push_good(lno) 270 # else we're in an unclosed bracket structure 271 continue 272 273 if ch == '(': 274 level = level + 1 275 continue 276 277 if ch == ')': 278 if level: 279 level = level - 1 280 # else the program is invalid, but we can't complain 281 continue 282 283 if ch == '"' or ch == "'": 284 # consume the string 285 quote = ch 286 if str[i-1:i+2] == quote * 3: 287 quote = quote * 3 288 firstlno = lno 289 w = len(quote) - 1 290 i = i+w 291 while i < n: 292 ch = str[i] 293 i = i+1 294 295 if ch == 'x': 296 continue 297 298 if str[i-1:i+w] == quote: 299 i = i+w 300 break 301 302 if ch == '\n': 303 lno = lno + 1 304 if w == 0: 305 # unterminated single-quoted string 306 if level == 0: 307 push_good(lno) 308 break 309 continue 310 311 if ch == '\\': 312 assert i < n 313 if str[i] == '\n': 314 lno = lno + 1 315 i = i+1 316 continue 317 318 # else comment char or paren inside string 319 320 else: 321 # didn't break out of the loop, so we're still 322 # inside a string 323 if (lno - 1) == firstlno: 324 # before the previous \n in str, we were in the first 325 # line of the string 326 continuation = C_STRING_FIRST_LINE 327 else: 328 continuation = C_STRING_NEXT_LINES 329 continue # with outer loop 330 331 if ch == '#': 332 # consume the comment 333 i = str.find('\n', i) 334 assert i >= 0 335 continue 336 337 assert ch == '\\' 338 assert i < n 339 if str[i] == '\n': 340 lno = lno + 1 341 if i+1 == n: 342 continuation = C_BACKSLASH 343 i = i+1 344 345 # The last stmt may be continued for all 3 reasons. 346 # String continuation takes precedence over bracket 347 # continuation, which beats backslash continuation. 348 if (continuation != C_STRING_FIRST_LINE 349 and continuation != C_STRING_NEXT_LINES and level > 0): 350 continuation = C_BRACKET 351 self.continuation = continuation 352 353 # Push the final line number as a sentinel value, regardless of 354 # whether it's continued. 355 assert (continuation == C_NONE) == (goodlines[-1] == lno) 356 if goodlines[-1] != lno: 357 push_good(lno) 358 359 def get_continuation_type(self): 360 self._study1() 361 return self.continuation 362 363 # study1 was sufficient to determine the continuation status, 364 # but doing more requires looking at every character. study2 365 # does this for the last interesting statement in the block. 366 # Creates: 367 # self.stmt_start, stmt_end 368 # slice indices of last interesting stmt 369 # self.stmt_bracketing 370 # the bracketing structure of the last interesting stmt; 371 # for example, for the statement "say(boo) or die", stmt_bracketing 372 # will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are 373 # treated as brackets, for the matter. 374 # self.lastch 375 # last non-whitespace character before optional trailing 376 # comment 377 # self.lastopenbracketpos 378 # if continuation is C_BRACKET, index of last open bracket 379 380 def _study2(self): 381 if self.study_level >= 2: 382 return 383 self._study1() 384 self.study_level = 2 385 386 # Set p and q to slice indices of last interesting stmt. 387 str, goodlines = self.str, self.goodlines 388 i = len(goodlines) - 1 389 p = len(str) # index of newest line 390 while i: 391 assert p 392 # p is the index of the stmt at line number goodlines[i]. 393 # Move p back to the stmt at line number goodlines[i-1]. 394 q = p 395 for nothing in range(goodlines[i-1], goodlines[i]): 396 # tricky: sets p to 0 if no preceding newline 397 p = str.rfind('\n', 0, p-1) + 1 398 # The stmt str[p:q] isn't a continuation, but may be blank 399 # or a non-indenting comment line. 400 if _junkre(str, p): 401 i = i-1 402 else: 403 break 404 if i == 0: 405 # nothing but junk! 406 assert p == 0 407 q = p 408 self.stmt_start, self.stmt_end = p, q 409 410 # Analyze this stmt, to find the last open bracket (if any) 411 # and last interesting character (if any). 412 lastch = "" 413 stack = [] # stack of open bracket indices 414 push_stack = stack.append 415 bracketing = [(p, 0)] 416 while p < q: 417 # suck up all except ()[]{}'"#\\ 418 m = _chew_ordinaryre(str, p, q) 419 if m: 420 # we skipped at least one boring char 421 newp = m.end() 422 # back up over totally boring whitespace 423 i = newp - 1 # index of last boring char 424 while i >= p and str[i] in " \t\n": 425 i = i-1 426 if i >= p: 427 lastch = str[i] 428 p = newp 429 if p >= q: 430 break 431 432 ch = str[p] 433 434 if ch in "([{": 435 push_stack(p) 436 bracketing.append((p, len(stack))) 437 lastch = ch 438 p = p+1 439 continue 440 441 if ch in ")]}": 442 if stack: 443 del stack[-1] 444 lastch = ch 445 p = p+1 446 bracketing.append((p, len(stack))) 447 continue 448 449 if ch == '"' or ch == "'": 450 # consume string 451 # Note that study1 did this with a Python loop, but 452 # we use a regexp here; the reason is speed in both 453 # cases; the string may be huge, but study1 pre-squashed 454 # strings to a couple of characters per line. study1 455 # also needed to keep track of newlines, and we don't 456 # have to. 457 bracketing.append((p, len(stack)+1)) 458 lastch = ch 459 p = _match_stringre(str, p, q).end() 460 bracketing.append((p, len(stack))) 461 continue 462 463 if ch == '#': 464 # consume comment and trailing newline 465 bracketing.append((p, len(stack)+1)) 466 p = str.find('\n', p, q) + 1 467 assert p > 0 468 bracketing.append((p, len(stack))) 469 continue 470 471 assert ch == '\\' 472 p = p+1 # beyond backslash 473 assert p < q 474 if str[p] != '\n': 475 # the program is invalid, but can't complain 476 lastch = ch + str[p] 477 p = p+1 # beyond escaped char 478 479 # end while p < q: 480 481 self.lastch = lastch 482 if stack: 483 self.lastopenbracketpos = stack[-1] 484 self.stmt_bracketing = tuple(bracketing) 485 486 # Assuming continuation is C_BRACKET, return the number 487 # of spaces the next line should be indented. 488 489 def compute_bracket_indent(self): 490 self._study2() 491 assert self.continuation == C_BRACKET 492 j = self.lastopenbracketpos 493 str = self.str 494 n = len(str) 495 origi = i = str.rfind('\n', 0, j) + 1 496 j = j+1 # one beyond open bracket 497 # find first list item; set i to start of its line 498 while j < n: 499 m = _itemre(str, j) 500 if m: 501 j = m.end() - 1 # index of first interesting char 502 extra = 0 503 break 504 else: 505 # this line is junk; advance to next line 506 i = j = str.find('\n', j) + 1 507 else: 508 # nothing interesting follows the bracket; 509 # reproduce the bracket line's indentation + a level 510 j = i = origi 511 while str[j] in " \t": 512 j = j+1 513 extra = self.indentwidth 514 return len(str[i:j].expandtabs(self.tabwidth)) + extra 515 516 # Return number of physical lines in last stmt (whether or not 517 # it's an interesting stmt! this is intended to be called when 518 # continuation is C_BACKSLASH). 519 520 def get_num_lines_in_stmt(self): 521 self._study1() 522 goodlines = self.goodlines 523 return goodlines[-1] - goodlines[-2] 524 525 # Assuming continuation is C_BACKSLASH, return the number of spaces 526 # the next line should be indented. Also assuming the new line is 527 # the first one following the initial line of the stmt. 528 529 def compute_backslash_indent(self): 530 self._study2() 531 assert self.continuation == C_BACKSLASH 532 str = self.str 533 i = self.stmt_start 534 while str[i] in " \t": 535 i = i+1 536 startpos = i 537 538 # See whether the initial line starts an assignment stmt; i.e., 539 # look for an = operator 540 endpos = str.find('\n', startpos) + 1 541 found = level = 0 542 while i < endpos: 543 ch = str[i] 544 if ch in "([{": 545 level = level + 1 546 i = i+1 547 elif ch in ")]}": 548 if level: 549 level = level - 1 550 i = i+1 551 elif ch == '"' or ch == "'": 552 i = _match_stringre(str, i, endpos).end() 553 elif ch == '#': 554 break 555 elif level == 0 and ch == '=' and \ 556 (i == 0 or str[i-1] not in "=<>!") and \ 557 str[i+1] != '=': 558 found = 1 559 break 560 else: 561 i = i+1 562 563 if found: 564 # found a legit =, but it may be the last interesting 565 # thing on the line 566 i = i+1 # move beyond the = 567 found = re.match(r"\s*\\", str[i:endpos]) is None 568 569 if not found: 570 # oh well ... settle for moving beyond the first chunk 571 # of non-whitespace chars 572 i = startpos 573 while str[i] not in " \t\n": 574 i = i+1 575 576 return len(str[self.stmt_start:i].expandtabs(\ 577 self.tabwidth)) + 1 578 579 # Return the leading whitespace on the initial line of the last 580 # interesting stmt. 581 582 def get_base_indent_string(self): 583 self._study2() 584 i, n = self.stmt_start, self.stmt_end 585 j = i 586 str = self.str 587 while j < n and str[j] in " \t": 588 j = j + 1 589 return str[i:j] 590 591 # Did the last interesting stmt open a block? 592 593 def is_block_opener(self): 594 self._study2() 595 return self.lastch == ':' 596 597 # Did the last interesting stmt close a block? 598 599 def is_block_closer(self): 600 self._study2() 601 return _closere(self.str, self.stmt_start) is not None 602 603 # index of last open bracket ({[, or None if none 604 lastopenbracketpos = None 605 606 def get_last_open_bracket_pos(self): 607 self._study2() 608 return self.lastopenbracketpos 609 610 # the structure of the bracketing of the last interesting statement, 611 # in the format defined in _study2, or None if the text didn't contain 612 # anything 613 stmt_bracketing = None 614 615 def get_last_stmt_bracketing(self): 616 self._study2() 617 return self.stmt_bracketing 618