1from collections import Mapping
2import re
3import sys
4
5# Reason last stmt is continued (or C_NONE if it's not).
6(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
7 C_STRING_NEXT_LINES, C_BRACKET) = range(5)
8
9if 0:   # for throwaway debugging output
10    def dump(*stuff):
11        sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
12
13# Find what looks like the start of a popular stmt.
14
15_synchre = re.compile(r"""
16    ^
17    [ \t]*
18    (?: while
19    |   else
20    |   def
21    |   return
22    |   assert
23    |   break
24    |   class
25    |   continue
26    |   elif
27    |   try
28    |   except
29    |   raise
30    |   import
31    |   yield
32    )
33    \b
34""", re.VERBOSE | re.MULTILINE).search
35
36# Match blank line or non-indenting comment line.
37
38_junkre = re.compile(r"""
39    [ \t]*
40    (?: \# \S .* )?
41    \n
42""", re.VERBOSE).match
43
44# Match any flavor of string; the terminating quote is optional
45# so that we're robust in the face of incomplete program text.
46
47_match_stringre = re.compile(r"""
48    \""" [^"\\]* (?:
49                     (?: \\. | "(?!"") )
50                     [^"\\]*
51                 )*
52    (?: \""" )?
53
54|   " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
55
56|   ''' [^'\\]* (?:
57                   (?: \\. | '(?!'') )
58                   [^'\\]*
59                )*
60    (?: ''' )?
61
62|   ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
63""", re.VERBOSE | re.DOTALL).match
64
65# Match a line that starts with something interesting;
66# used to find the first item of a bracket structure.
67
68_itemre = re.compile(r"""
69    [ \t]*
70    [^\s#\\]    # if we match, m.end()-1 is the interesting char
71""", re.VERBOSE).match
72
73# Match start of stmts that should be followed by a dedent.
74
75_closere = re.compile(r"""
76    \s*
77    (?: return
78    |   break
79    |   continue
80    |   raise
81    |   pass
82    )
83    \b
84""", re.VERBOSE).match
85
86# Chew up non-special chars as quickly as possible.  If match is
87# successful, m.end() less 1 is the index of the last boring char
88# matched.  If match is unsuccessful, the string starts with an
89# interesting char.
90
91_chew_ordinaryre = re.compile(r"""
92    [^[\](){}#'"\\]+
93""", re.VERBOSE).match
94
95
96class StringTranslatePseudoMapping(Mapping):
97    r"""Utility class to be used with str.translate()
98
99    This Mapping class wraps a given dict. When a value for a key is
100    requested via __getitem__() or get(), the key is looked up in the
101    given dict. If found there, the value from the dict is returned.
102    Otherwise, the default value given upon initialization is returned.
103
104    This allows using str.translate() to make some replacements, and to
105    replace all characters for which no replacement was specified with
106    a given character instead of leaving them as-is.
107
108    For example, to replace everything except whitespace with 'x':
109
110    >>> whitespace_chars = ' \t\n\r'
111    >>> preserve_dict = {ord(c): ord(c) for c in whitespace_chars}
112    >>> mapping = StringTranslatePseudoMapping(preserve_dict, ord('x'))
113    >>> text = "a + b\tc\nd"
114    >>> text.translate(mapping)
115    'x x x\tx\nx'
116    """
117    def __init__(self, non_defaults, default_value):
118        self._non_defaults = non_defaults
119        self._default_value = default_value
120
121        def _get(key, _get=non_defaults.get, _default=default_value):
122            return _get(key, _default)
123        self._get = _get
124
125    def __getitem__(self, item):
126        return self._get(item)
127
128    def __len__(self):
129        return len(self._non_defaults)
130
131    def __iter__(self):
132        return iter(self._non_defaults)
133
134    def get(self, key, default=None):
135        return self._get(key)
136
137
138class Parser:
139
140    def __init__(self, indentwidth, tabwidth):
141        self.indentwidth = indentwidth
142        self.tabwidth = tabwidth
143
144    def set_str(self, s):
145        assert len(s) == 0 or s[-1] == '\n'
146        self.str = s
147        self.study_level = 0
148
149    # Return index of a good place to begin parsing, as close to the
150    # end of the string as possible.  This will be the start of some
151    # popular stmt like "if" or "def".  Return None if none found:
152    # the caller should pass more prior context then, if possible, or
153    # if not (the entire program text up until the point of interest
154    # has already been tried) pass 0 to set_lo.
155    #
156    # This will be reliable iff given a reliable is_char_in_string
157    # function, meaning that when it says "no", it's absolutely
158    # guaranteed that the char is not in a string.
159
160    def find_good_parse_start(self, is_char_in_string=None,
161                              _synchre=_synchre):
162        str, pos = self.str, None
163
164        if not is_char_in_string:
165            # no clue -- make the caller pass everything
166            return None
167
168        # Peek back from the end for a good place to start,
169        # but don't try too often; pos will be left None, or
170        # bumped to a legitimate synch point.
171        limit = len(str)
172        for tries in range(5):
173            i = str.rfind(":\n", 0, limit)
174            if i < 0:
175                break
176            i = str.rfind('\n', 0, i) + 1  # start of colon line
177            m = _synchre(str, i, limit)
178            if m and not is_char_in_string(m.start()):
179                pos = m.start()
180                break
181            limit = i
182        if pos is None:
183            # Nothing looks like a block-opener, or stuff does
184            # but is_char_in_string keeps returning true; most likely
185            # we're in or near a giant string, the colorizer hasn't
186            # caught up enough to be helpful, or there simply *aren't*
187            # any interesting stmts.  In any of these cases we're
188            # going to have to parse the whole thing to be sure, so
189            # give it one last try from the start, but stop wasting
190            # time here regardless of the outcome.
191            m = _synchre(str)
192            if m and not is_char_in_string(m.start()):
193                pos = m.start()
194            return pos
195
196        # Peeking back worked; look forward until _synchre no longer
197        # matches.
198        i = pos + 1
199        while 1:
200            m = _synchre(str, i)
201            if m:
202                s, i = m.span()
203                if not is_char_in_string(s):
204                    pos = s
205            else:
206                break
207        return pos
208
209    # Throw away the start of the string.  Intended to be called with
210    # find_good_parse_start's result.
211
212    def set_lo(self, lo):
213        assert lo == 0 or self.str[lo-1] == '\n'
214        if lo > 0:
215            self.str = self.str[lo:]
216
217    # Build a translation table to map uninteresting chars to 'x', open
218    # brackets to '(', close brackets to ')' while preserving quotes,
219    # backslashes, newlines and hashes. This is to be passed to
220    # str.translate() in _study1().
221    _tran = {}
222    _tran.update((ord(c), ord('(')) for c in "({[")
223    _tran.update((ord(c), ord(')')) for c in ")}]")
224    _tran.update((ord(c), ord(c)) for c in "\"'\\\n#")
225    _tran = StringTranslatePseudoMapping(_tran, default_value=ord('x'))
226
227    # As quickly as humanly possible <wink>, find the line numbers (0-
228    # based) of the non-continuation lines.
229    # Creates self.{goodlines, continuation}.
230
231    def _study1(self):
232        if self.study_level >= 1:
233            return
234        self.study_level = 1
235
236        # Map all uninteresting characters to "x", all open brackets
237        # to "(", all close brackets to ")", then collapse runs of
238        # uninteresting characters.  This can cut the number of chars
239        # by a factor of 10-40, and so greatly speed the following loop.
240        str = self.str
241        str = str.translate(self._tran)
242        str = str.replace('xxxxxxxx', 'x')
243        str = str.replace('xxxx', 'x')
244        str = str.replace('xx', 'x')
245        str = str.replace('xx', 'x')
246        str = str.replace('\nx', '\n')
247        # note that replacing x\n with \n would be incorrect, because
248        # x may be preceded by a backslash
249
250        # March over the squashed version of the program, accumulating
251        # the line numbers of non-continued stmts, and determining
252        # whether & why the last stmt is a continuation.
253        continuation = C_NONE
254        level = lno = 0     # level is nesting level; lno is line number
255        self.goodlines = goodlines = [0]
256        push_good = goodlines.append
257        i, n = 0, len(str)
258        while i < n:
259            ch = str[i]
260            i = i+1
261
262            # cases are checked in decreasing order of frequency
263            if ch == 'x':
264                continue
265
266            if ch == '\n':
267                lno = lno + 1
268                if level == 0:
269                    push_good(lno)
270                    # else we're in an unclosed bracket structure
271                continue
272
273            if ch == '(':
274                level = level + 1
275                continue
276
277            if ch == ')':
278                if level:
279                    level = level - 1
280                    # else the program is invalid, but we can't complain
281                continue
282
283            if ch == '"' or ch == "'":
284                # consume the string
285                quote = ch
286                if str[i-1:i+2] == quote * 3:
287                    quote = quote * 3
288                firstlno = lno
289                w = len(quote) - 1
290                i = i+w
291                while i < n:
292                    ch = str[i]
293                    i = i+1
294
295                    if ch == 'x':
296                        continue
297
298                    if str[i-1:i+w] == quote:
299                        i = i+w
300                        break
301
302                    if ch == '\n':
303                        lno = lno + 1
304                        if w == 0:
305                            # unterminated single-quoted string
306                            if level == 0:
307                                push_good(lno)
308                            break
309                        continue
310
311                    if ch == '\\':
312                        assert i < n
313                        if str[i] == '\n':
314                            lno = lno + 1
315                        i = i+1
316                        continue
317
318                    # else comment char or paren inside string
319
320                else:
321                    # didn't break out of the loop, so we're still
322                    # inside a string
323                    if (lno - 1) == firstlno:
324                        # before the previous \n in str, we were in the first
325                        # line of the string
326                        continuation = C_STRING_FIRST_LINE
327                    else:
328                        continuation = C_STRING_NEXT_LINES
329                continue    # with outer loop
330
331            if ch == '#':
332                # consume the comment
333                i = str.find('\n', i)
334                assert i >= 0
335                continue
336
337            assert ch == '\\'
338            assert i < n
339            if str[i] == '\n':
340                lno = lno + 1
341                if i+1 == n:
342                    continuation = C_BACKSLASH
343            i = i+1
344
345        # The last stmt may be continued for all 3 reasons.
346        # String continuation takes precedence over bracket
347        # continuation, which beats backslash continuation.
348        if (continuation != C_STRING_FIRST_LINE
349            and continuation != C_STRING_NEXT_LINES and level > 0):
350            continuation = C_BRACKET
351        self.continuation = continuation
352
353        # Push the final line number as a sentinel value, regardless of
354        # whether it's continued.
355        assert (continuation == C_NONE) == (goodlines[-1] == lno)
356        if goodlines[-1] != lno:
357            push_good(lno)
358
359    def get_continuation_type(self):
360        self._study1()
361        return self.continuation
362
363    # study1 was sufficient to determine the continuation status,
364    # but doing more requires looking at every character.  study2
365    # does this for the last interesting statement in the block.
366    # Creates:
367    #     self.stmt_start, stmt_end
368    #         slice indices of last interesting stmt
369    #     self.stmt_bracketing
370    #         the bracketing structure of the last interesting stmt;
371    #         for example, for the statement "say(boo) or die", stmt_bracketing
372    #         will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are
373    #         treated as brackets, for the matter.
374    #     self.lastch
375    #         last non-whitespace character before optional trailing
376    #         comment
377    #     self.lastopenbracketpos
378    #         if continuation is C_BRACKET, index of last open bracket
379
380    def _study2(self):
381        if self.study_level >= 2:
382            return
383        self._study1()
384        self.study_level = 2
385
386        # Set p and q to slice indices of last interesting stmt.
387        str, goodlines = self.str, self.goodlines
388        i = len(goodlines) - 1
389        p = len(str)    # index of newest line
390        while i:
391            assert p
392            # p is the index of the stmt at line number goodlines[i].
393            # Move p back to the stmt at line number goodlines[i-1].
394            q = p
395            for nothing in range(goodlines[i-1], goodlines[i]):
396                # tricky: sets p to 0 if no preceding newline
397                p = str.rfind('\n', 0, p-1) + 1
398            # The stmt str[p:q] isn't a continuation, but may be blank
399            # or a non-indenting comment line.
400            if  _junkre(str, p):
401                i = i-1
402            else:
403                break
404        if i == 0:
405            # nothing but junk!
406            assert p == 0
407            q = p
408        self.stmt_start, self.stmt_end = p, q
409
410        # Analyze this stmt, to find the last open bracket (if any)
411        # and last interesting character (if any).
412        lastch = ""
413        stack = []  # stack of open bracket indices
414        push_stack = stack.append
415        bracketing = [(p, 0)]
416        while p < q:
417            # suck up all except ()[]{}'"#\\
418            m = _chew_ordinaryre(str, p, q)
419            if m:
420                # we skipped at least one boring char
421                newp = m.end()
422                # back up over totally boring whitespace
423                i = newp - 1    # index of last boring char
424                while i >= p and str[i] in " \t\n":
425                    i = i-1
426                if i >= p:
427                    lastch = str[i]
428                p = newp
429                if p >= q:
430                    break
431
432            ch = str[p]
433
434            if ch in "([{":
435                push_stack(p)
436                bracketing.append((p, len(stack)))
437                lastch = ch
438                p = p+1
439                continue
440
441            if ch in ")]}":
442                if stack:
443                    del stack[-1]
444                lastch = ch
445                p = p+1
446                bracketing.append((p, len(stack)))
447                continue
448
449            if ch == '"' or ch == "'":
450                # consume string
451                # Note that study1 did this with a Python loop, but
452                # we use a regexp here; the reason is speed in both
453                # cases; the string may be huge, but study1 pre-squashed
454                # strings to a couple of characters per line.  study1
455                # also needed to keep track of newlines, and we don't
456                # have to.
457                bracketing.append((p, len(stack)+1))
458                lastch = ch
459                p = _match_stringre(str, p, q).end()
460                bracketing.append((p, len(stack)))
461                continue
462
463            if ch == '#':
464                # consume comment and trailing newline
465                bracketing.append((p, len(stack)+1))
466                p = str.find('\n', p, q) + 1
467                assert p > 0
468                bracketing.append((p, len(stack)))
469                continue
470
471            assert ch == '\\'
472            p = p+1     # beyond backslash
473            assert p < q
474            if str[p] != '\n':
475                # the program is invalid, but can't complain
476                lastch = ch + str[p]
477            p = p+1     # beyond escaped char
478
479        # end while p < q:
480
481        self.lastch = lastch
482        if stack:
483            self.lastopenbracketpos = stack[-1]
484        self.stmt_bracketing = tuple(bracketing)
485
486    # Assuming continuation is C_BRACKET, return the number
487    # of spaces the next line should be indented.
488
489    def compute_bracket_indent(self):
490        self._study2()
491        assert self.continuation == C_BRACKET
492        j = self.lastopenbracketpos
493        str = self.str
494        n = len(str)
495        origi = i = str.rfind('\n', 0, j) + 1
496        j = j+1     # one beyond open bracket
497        # find first list item; set i to start of its line
498        while j < n:
499            m = _itemre(str, j)
500            if m:
501                j = m.end() - 1     # index of first interesting char
502                extra = 0
503                break
504            else:
505                # this line is junk; advance to next line
506                i = j = str.find('\n', j) + 1
507        else:
508            # nothing interesting follows the bracket;
509            # reproduce the bracket line's indentation + a level
510            j = i = origi
511            while str[j] in " \t":
512                j = j+1
513            extra = self.indentwidth
514        return len(str[i:j].expandtabs(self.tabwidth)) + extra
515
516    # Return number of physical lines in last stmt (whether or not
517    # it's an interesting stmt!  this is intended to be called when
518    # continuation is C_BACKSLASH).
519
520    def get_num_lines_in_stmt(self):
521        self._study1()
522        goodlines = self.goodlines
523        return goodlines[-1] - goodlines[-2]
524
525    # Assuming continuation is C_BACKSLASH, return the number of spaces
526    # the next line should be indented.  Also assuming the new line is
527    # the first one following the initial line of the stmt.
528
529    def compute_backslash_indent(self):
530        self._study2()
531        assert self.continuation == C_BACKSLASH
532        str = self.str
533        i = self.stmt_start
534        while str[i] in " \t":
535            i = i+1
536        startpos = i
537
538        # See whether the initial line starts an assignment stmt; i.e.,
539        # look for an = operator
540        endpos = str.find('\n', startpos) + 1
541        found = level = 0
542        while i < endpos:
543            ch = str[i]
544            if ch in "([{":
545                level = level + 1
546                i = i+1
547            elif ch in ")]}":
548                if level:
549                    level = level - 1
550                i = i+1
551            elif ch == '"' or ch == "'":
552                i = _match_stringre(str, i, endpos).end()
553            elif ch == '#':
554                break
555            elif level == 0 and ch == '=' and \
556                   (i == 0 or str[i-1] not in "=<>!") and \
557                   str[i+1] != '=':
558                found = 1
559                break
560            else:
561                i = i+1
562
563        if found:
564            # found a legit =, but it may be the last interesting
565            # thing on the line
566            i = i+1     # move beyond the =
567            found = re.match(r"\s*\\", str[i:endpos]) is None
568
569        if not found:
570            # oh well ... settle for moving beyond the first chunk
571            # of non-whitespace chars
572            i = startpos
573            while str[i] not in " \t\n":
574                i = i+1
575
576        return len(str[self.stmt_start:i].expandtabs(\
577                                     self.tabwidth)) + 1
578
579    # Return the leading whitespace on the initial line of the last
580    # interesting stmt.
581
582    def get_base_indent_string(self):
583        self._study2()
584        i, n = self.stmt_start, self.stmt_end
585        j = i
586        str = self.str
587        while j < n and str[j] in " \t":
588            j = j + 1
589        return str[i:j]
590
591    # Did the last interesting stmt open a block?
592
593    def is_block_opener(self):
594        self._study2()
595        return self.lastch == ':'
596
597    # Did the last interesting stmt close a block?
598
599    def is_block_closer(self):
600        self._study2()
601        return _closere(self.str, self.stmt_start) is not None
602
603    # index of last open bracket ({[, or None if none
604    lastopenbracketpos = None
605
606    def get_last_open_bracket_pos(self):
607        self._study2()
608        return self.lastopenbracketpos
609
610    # the structure of the bracketing of the last interesting statement,
611    # in the format defined in _study2, or None if the text didn't contain
612    # anything
613    stmt_bracketing = None
614
615    def get_last_stmt_bracketing(self):
616        self._study2()
617        return self.stmt_bracketing
618