1"""ANTLR3 runtime package"""
2
3# begin[licence]
4#
5# [The "BSD licence"]
6# Copyright (c) 2005-2012 Terence Parr
7# All rights reserved.
8#
9# Redistribution and use in source and binary forms, with or without
10# modification, are permitted provided that the following conditions
11# are met:
12# 1. Redistributions of source code must retain the above copyright
13#    notice, this list of conditions and the following disclaimer.
14# 2. Redistributions in binary form must reproduce the above copyright
15#    notice, this list of conditions and the following disclaimer in the
16#    documentation and/or other materials provided with the distribution.
17# 3. The name of the author may not be used to endorse or promote products
18#    derived from this software without specific prior written permission.
19#
20# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30#
31# end[licence]
32
33from io import StringIO
34
35from .constants import DEFAULT_CHANNEL, EOF
36from .tokens import Token


############################################################################
#
# basic interfaces
#   IntStream
#    +- CharStream
#    \- TokenStream
#
# subclasses must implement all methods
#
############################################################################

class IntStream(object):
    """
    @brief Base interface for streams of integer values.

    A simple stream of integers used when all I care about is the char
    or token type sequence (such as interpretation).
    """

    def consume(self):
        raise NotImplementedError


    def LA(self, i):
        """Get int at current input pointer + i ahead where i=1 is next int.

        Negative indexes are allowed.  LA(-1) is previous token (token
        just matched).  LA(-i) where i is before first token should
        yield -1, invalid char / EOF.
        """

        raise NotImplementedError


    def mark(self):
        """
        Tell the stream to start buffering if it hasn't already.  Return
        current input position, index(), or some other marker so that
        when passed to rewind() you get back to the same spot.
        rewind(mark()) should not affect the input cursor.  The Lexer
        tracks line/col info as well as input index so its markers are
        not pure input indexes.  Same for tree node streams.
        """

        raise NotImplementedError


    def index(self):
        """
        Return the current input symbol index 0..n where n indicates the
        last symbol has been read.  The index is the symbol about to be
        read not the most recently read symbol.
        """

        raise NotImplementedError


    def rewind(self, marker=None):
        """
        Reset the stream so that next call to index would return marker.
        The marker will usually be index() but it doesn't have to be.  It's
        just a marker to indicate what state the stream was in.  This is
        essentially calling release() and seek().  If there are markers
        created after this marker argument, this routine must unroll them
        like a stack.  Assume the state the stream was in when this marker
        was created.

        If marker is None:
        Rewind to the input position of the last marker.
        Used currently only after a cyclic DFA and just
        before starting a sem/syn predicate to get the
        input position back to the start of the decision.
        Do not "pop" the marker off the state.  mark(i)
        and rewind(i) should balance still. It is
        like invoking rewind(last marker) but it should not "pop"
        the marker off.  It's like seek(last marker's input position).
        """

        raise NotImplementedError


    def release(self, marker=None):
        """
        You may want to commit to a backtrack but don't want to force the
        stream to keep bookkeeping objects around for a marker that is
        no longer necessary.  This will have the same behavior as
        rewind() except it releases resources without the backward seek.
        This must throw away resources for all markers back to the marker
        argument.  So if you're nested 5 levels of mark(), and then release(2)
        you have to release resources for depths 2..5.
        """

        raise NotImplementedError


    def seek(self, index):
        """
        Set the input cursor to the position indicated by index.  This is
        normally used to seek ahead in the input stream.  No buffering is
        required to do this unless you know your stream will use seek to
        move backwards such as when backtracking.

        This is different from rewind in its multi-directional
        requirement and in that its argument is strictly an input cursor
        (index).

        For char streams, seeking forward must update the stream state such
        as line number.  For seeking backwards, you will be presumably
        backtracking using the mark/rewind mechanism that restores state and
        so this method does not need to update state when seeking backwards.

        Currently, this method is only used for efficient backtracking using
        memoization, but in the future it may be used for incremental parsing.

        The index is 0..n-1.  A seek to position i means that LA(1) will
        return the ith symbol.  So, seeking to 0 means LA(1) will return the
        first element in the stream.
        """

        raise NotImplementedError


    def size(self):
        """
        Only makes sense for streams that buffer everything up probably, but
        might be useful to display the entire stream or for testing.  This
        value includes a single EOF.
        """

        raise NotImplementedError


    def getSourceName(self):
        """
        Where are you getting symbols from?  Normally, implementations will
        pass the buck all the way to the lexer who can ask its input stream
        for the file name or whatever.
        """

        raise NotImplementedError


class CharStream(IntStream):
    """
    @brief A source of characters for an ANTLR lexer.

    This is an abstract class that must be implemented by a subclass.

    """

    # pylint does not realize that this is an interface, too
    #pylint: disable-msg=W0223

    EOF = -1

    def __init__(self):
        # line number 1..n within the input
        self._line = 1

        # The index of the character relative to the beginning of the
        # line 0..n-1
        self._charPositionInLine = 0


    def substring(self, start, stop):
        """
        For infinite streams, you don't need this; primarily I'm providing
        a useful interface for action code.  Just make sure actions don't
        use this on streams that don't support it.
        """

        raise NotImplementedError


    def LT(self, i):
        """
        Get the ith character of lookahead.  This is the same usually as
        LA(i).  This will be used for labels in the generated
        lexer code.  I'd prefer to return a char here type-wise, but it's
        probably better to be 32-bit clean and be consistent with LA.
        """

        raise NotImplementedError


    @property
    def line(self):
        """ANTLR tracks the line information automatically"""
        return self._line

    @line.setter
    def line(self, value):
        """
        Because this stream can rewind, we need to be able to reset the line
        """
        self._line = value


    @property
    def charPositionInLine(self):
        """
        The index of the character relative to the beginning of the line 0..n-1
        """
        return self._charPositionInLine

    @charPositionInLine.setter
    def charPositionInLine(self, pos):
        self._charPositionInLine = pos


class TokenStream(IntStream):
    """

    @brief A stream of tokens accessing tokens from a TokenSource

    This is an abstract class that must be implemented by a subclass.

    """

    # pylint does not realize that this is an interface, too
    #pylint: disable-msg=W0223

    def LT(self, k):
        """
        Get the Token at the current input pointer + k ahead, where k=1 is
        the next Token.  k<0 looks into the past: LT(-1) is the previous
        token and LT(-2) is two tokens ago.  LT(0) is undefined.  For k>=n,
        return the EOF token.  Return None for LT(0) and for any index that
        results in an absolute position before the start of the buffer.
        """

        raise NotImplementedError


    def range(self):
        """
        How far ahead has the stream been asked to look?  The return
        value is a valid index from 0..n-1.
        """

        raise NotImplementedError


    def get(self, i):
        """
        Get a token at an absolute index i; 0..n-1.  This is really only
        needed for profiling and debugging and token stream rewriting.
        If you don't want to buffer up tokens, then this method makes no
        sense for you.  Naturally you can't use the rewrite stream feature.
        I believe DebugTokenStream can easily be altered to not use
        this method, removing the dependency.
        """

        raise NotImplementedError


    def getTokenSource(self):
        """
        Where is this stream pulling tokens from?  This is not the name, but
        the object that provides Token objects.
        """

        raise NotImplementedError


    def toString(self, start=None, stop=None):
        """
        Return the text of all tokens from start to stop, inclusive.
        If the stream does not buffer all the tokens then it can just
        return "" or None; in that case users should not access
        $ruleLabel.text in an action, of course.

        Because the user is not required to use a token with an index stored
        in it, we must provide a means for two token objects themselves to
        indicate the start/end location.  Most often this will just delegate
        to the other toString(int,int).  This is also parallel with
        the TreeNodeStream.toString(Object,Object).
        """

        raise NotImplementedError


############################################################################
#
# character streams for use in lexers
#   CharStream
#   \- ANTLRStringStream
#
############################################################################


class ANTLRStringStream(CharStream):
    """
    @brief CharStream that pulls data from a unicode string.

    A pretty quick CharStream that pulls all data from an array
    directly.  Every method call counts in the lexer.

    """


    def __init__(self, data):
        """
        @param data This should be a unicode string holding the data you want
        to parse. If you pass in a byte string, the Lexer will choke on
        non-ASCII data.
        """

        super().__init__()

        # The data being scanned
        self.strdata = str(data)
        self.data = [ord(c) for c in self.strdata]

        # How many characters are actually in the buffer
        self.n = len(data)

        # 0..n-1 index into string of next char
        self.p = 0

        # A list of (p, line, charPositionInLine) tuples that tracks the
        # stream state values that can change as you move through the
        # input stream.  Indexed from 0..markDepth-1.
        self._markers = [ ]
        self.lastMarker = None
        self.markDepth = 0

        # What is name or source of this char stream?
        self.name = None


    def reset(self):
        """
        Reset the stream so that it's in the same state it was
        when the object was created *except* the data array is not
        touched.
        """

        self.p = 0
        self._line = 1
        self.charPositionInLine = 0
        self._markers = [ ]
        self.lastMarker = None
        self.markDepth = 0


    def consume(self):
        if self.p < self.n:
            if self.data[self.p] == 10: # ord('\n')
                self._line += 1
                self.charPositionInLine = 0
            else:
                self.charPositionInLine += 1

            self.p += 1

        # else we reached EOF
        # just do nothing


    def LA(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1]

        if self.p + i - 1 < self.n:
            return self.data[self.p + i - 1]
        else:
            return EOF


    def LT(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1]

        if self.p + i - 1 < self.n:
            return self.strdata[self.p + i - 1]
        else:
            return EOF


    def index(self):
        """
        Return the current input symbol index 0..n where n indicates the
        last symbol has been read.  The index is the index of char to
        be returned from LA(1).
        """

        return self.p


    def size(self):
        return self.n


    def mark(self):
        state = (self.p, self.line, self.charPositionInLine)
        if self.markDepth < len(self._markers):
            self._markers[self.markDepth] = state
        else:
            self._markers.append(state)
        self.markDepth += 1

        self.lastMarker = self.markDepth

        return self.lastMarker


    def rewind(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        p, line, charPositionInLine = self._markers[marker - 1]

        self.seek(p)
        self._line = line
        self.charPositionInLine = charPositionInLine
        self.release(marker)


    def release(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        self.markDepth = marker - 1


    def seek(self, index):
        """
        consume() ahead until p==index; can't just set p=index as we must
        update line and charPositionInLine.
        """

        if index <= self.p:
            self.p = index # just jump; don't update stream state (line, ...)
            return

        # seek forward, consume until p hits index
        while self.p < index:
            self.consume()


    def substring(self, start, stop):
        return self.strdata[start:stop + 1]


    def getSourceName(self):
        return self.name
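
# A minimal usage sketch (illustrative, not part of the runtime API itself):
# how lookahead and the mark/rewind contract play out on an ANTLRStringStream.
#
#   stream = ANTLRStringStream("ab\ncd")
#   stream.LA(1)            # ord('a'); stream.LT(1) returns 'a'
#   stream.consume()        # now LA(1) == ord('b')
#   marker = stream.mark()  # snapshot of p, line and charPositionInLine
#   stream.consume()        # past 'b'
#   stream.consume()        # past '\n'; stream.line is now 2
#   stream.rewind(marker)   # back at 'b' on line 1; the marker is released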


class ANTLRFileStream(ANTLRStringStream):
    """
    @brief CharStream that opens a file to read the data.

    This is a char buffer stream that is loaded from a file
    all at once when you construct the object.
    """

    def __init__(self, fileName):
        """
        @param fileName The path to the file to be opened. The file will be
           opened with mode 'r'.

        """

        self._fileName = fileName

        with open(fileName, 'r') as fp:
            super().__init__(fp.read())


    @property
    def fileName(self):
        return self._fileName


class ANTLRInputStream(ANTLRStringStream):
    """
    @brief CharStream that reads data from a file-like object.

    This is a char buffer stream that is loaded from a file-like object
    all at once when you construct the object.

    All input is consumed from the file, but it is not closed.
    """

    def __init__(self, file):
        """
        @param file A file-like object holding your input. Only the read()
           method must be implemented.

        """

        data = file.read()

        super().__init__(data)


# I guess the ANTLR prefix exists only to avoid a name clash with some Java
# mumbojumbo. A plain "StringStream" looks better to me, which should be
# the preferred name in Python.
StringStream = ANTLRStringStream
FileStream = ANTLRFileStream
InputStream = ANTLRInputStream
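
# Illustrative sketch of how these stream classes are typically handed to a
# generated lexer ("MyLexer" stands in for a grammar-generated class):
#
#   lexer = MyLexer(StringStream("some input"))
#   lexer = MyLexer(FileStream("input.txt"))      # reads the whole file
#   with open("input.txt") as fp:
#       lexer = MyLexer(InputStream(fp))          # any object with read()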


############################################################################
#
# Token streams
#   TokenStream
#   +- CommonTokenStream
#   \- TokenRewriteStream
#
############################################################################


class CommonTokenStream(TokenStream):
    """
    @brief The most common stream of tokens

    The most common stream of tokens is one where every token is buffered up
    and tokens are prefiltered for a certain channel (the parser will only
    see these tokens and cannot change the filter channel number during the
    parse).
    """
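
    # Typical wiring (sketch; "MyLexer", "MyParser" and "startRule" stand in
    # for names that antlr3 would generate from a grammar):
    #
    #   char_stream = ANTLRStringStream("some input")
    #   tokens = CommonTokenStream(MyLexer(char_stream))
    #   parser = MyParser(tokens)
    #   parser.startRule()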

    def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL):
        """
        @param tokenSource A TokenSource instance (usually a Lexer) to pull
            the tokens from.

        @param channel Skip tokens on any channel but this one; this is how we
            skip whitespace...

        """

        super().__init__()

        self.tokenSource = tokenSource

        # Record every single token pulled from the source so we can reproduce
        # chunks of it later.
        self.tokens = []

        # Map<tokentype, channel> to override some Tokens' channel numbers
        self.channelOverrideMap = {}

        # Set<tokentype>; discard any tokens with this type
        self.discardSet = set()

        # Skip tokens on any channel but this one; this is how we skip
        # whitespace...
        self.channel = channel

        # By default, track all incoming tokens
        self.discardOffChannelTokens = False

        # The index into the tokens list of the current token (next token
        # to consume).  p==-1 indicates that the tokens list is empty
        self.p = -1

        # Remember last marked position
        self.lastMarker = None

        # how deep have we gone?
        self._range = -1


    def makeEOFToken(self):
        return self.tokenSource.makeEOFToken()


    def setTokenSource(self, tokenSource):
        """Reset this token stream by setting its token source."""

        self.tokenSource = tokenSource
        self.tokens = []
        self.p = -1
        self.channel = DEFAULT_CHANNEL


    def reset(self):
        self.p = 0
        self.lastMarker = None


    def fillBuffer(self):
        """
        Load all tokens from the token source and put in tokens.
        This is done upon first LT request because you might want to
        set some token type / channel overrides before filling buffer.
        """

        index = 0
        t = self.tokenSource.nextToken()
        while t and t.type != EOF:
            discard = False

            if self.discardSet and t.type in self.discardSet:
                discard = True

            elif self.discardOffChannelTokens and t.channel != self.channel:
                discard = True

            # is there a channel override for token type?
            if t.type in self.channelOverrideMap:
                overrideChannel = self.channelOverrideMap[t.type]

                if overrideChannel == self.channel:
                    t.channel = overrideChannel
                else:
                    discard = True

            if not discard:
                t.index = index
                self.tokens.append(t)
                index += 1

            t = self.tokenSource.nextToken()

        # leave p pointing at first token on channel
        self.p = 0
        self.p = self.skipOffTokenChannels(self.p)


    def consume(self):
        """
        Move the input pointer to the next incoming token.  The stream
        must become active with LT(1) available.  consume() simply
        moves the input pointer so that LT(1) points at the next
        input symbol. Consume at least one token.

        Walk past any token not on the channel the parser is listening to.
        """

        if self.p < len(self.tokens):
            self.p += 1

            self.p = self.skipOffTokenChannels(self.p) # leave p on valid token


    def skipOffTokenChannels(self, i):
        """
        Given a starting index, return the index of the first on-channel
        token.
        """

        n = len(self.tokens)
        while i < n and self.tokens[i].channel != self.channel:
            i += 1

        return i


    def skipOffTokenChannelsReverse(self, i):
        while i >= 0 and self.tokens[i].channel != self.channel:
            i -= 1

        return i


    def setTokenTypeChannel(self, ttype, channel):
        """
        A simple filter mechanism whereby you can tell this token stream
        to force all tokens of type ttype to be on channel.  For example,
        when interpreting, we cannot exec actions so we need to tell
        the stream to force all WS and NEWLINE to be a different, ignored
        channel.
        """

        self.channelOverrideMap[ttype] = channel
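
    # Example (sketch; WS and COMMENT are assumed token types from a generated
    # lexer, HIDDEN_CHANNEL is assumed to come from antlr3.constants):
    #
    #   tokens = CommonTokenStream(lexer)
    #   tokens.setTokenTypeChannel(WS, HIDDEN_CHANNEL)  # hide whitespace
    #   tokens.discardTokenType(COMMENT)                # drop comments entirely
    #   # overrides take effect when the buffer fills on the first LT() call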


    def discardTokenType(self, ttype):
        self.discardSet.add(ttype)


    def getTokens(self, start=None, stop=None, types=None):
        """
        Given a start and stop index, return a list of all tokens in
        the token type set.  Return None if no tokens were found.  This
        method looks at both on and off channel tokens.
        """

        if self.p == -1:
            self.fillBuffer()

        if stop is None or stop > len(self.tokens):
            stop = len(self.tokens)

        if start is None or start < 0:
            start = 0

        if start > stop:
            return None

        if isinstance(types, int):
            # called with a single type, wrap into set
            types = set([types])

        filteredTokens = [
            token for token in self.tokens[start:stop]
            if types is None or token.type in types
            ]

        if len(filteredTokens) == 0:
            return None

        return filteredTokens
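
    # Example (sketch; ID and INT are hypothetical token types from a
    # generated lexer):
    #
    #   tokens.getTokens()                 # every buffered token
    #   tokens.getTokens(0, 10, ID)        # ID tokens among indexes 0..9
    #   tokens.getTokens(types={ID, INT})  # or pass a set of types
    #   # returns None when nothing matches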


    def LT(self, k):
        """
        Get the kth token of lookahead from the current position, where k=1
        is the first symbol of lookahead.
        """

        if self.p == -1:
            self.fillBuffer()

        if k == 0:
            return None

        if k < 0:
            return self.LB(-k)

        i = self.p
        n = 1
        # find k good tokens
        while n < k:
            # skip off-channel tokens
            i = self.skipOffTokenChannels(i + 1) # leave p on valid token
            n += 1

        if i > self._range:
            self._range = i

        if i < len(self.tokens):
            return self.tokens[i]
        else:
            return self.makeEOFToken()


    def LB(self, k):
        """Look back k on-channel tokens."""

        if self.p == -1:
            self.fillBuffer()

        if k == 0:
            return None

        if self.p - k < 0:
            return None

        i = self.p
        n = 1
        # find k good tokens looking backwards
        while n <= k:
            # skip off-channel tokens
            i = self.skipOffTokenChannelsReverse(i - 1) # leave p on valid token
            n += 1

        if i < 0:
            return None

        return self.tokens[i]


    def get(self, i):
        """
        Return absolute token i; ignore which channel the tokens are on;
        that is, count all tokens not just on-channel tokens.
        """

        return self.tokens[i]


    def slice(self, start, stop):
        if self.p == -1:
            self.fillBuffer()

        if start < 0 or stop < 0:
            return None

        return self.tokens[start:stop + 1]


    def LA(self, i):
        return self.LT(i).type


    def mark(self):
        self.lastMarker = self.index()
        return self.lastMarker


    def release(self, marker=None):
        # no resources to release
        pass


    def size(self):
        return len(self.tokens)


    def range(self):
        return self._range


    def index(self):
        return self.p


    def rewind(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        self.seek(marker)


    def seek(self, index):
        self.p = index


    def getTokenSource(self):
        return self.tokenSource


    def getSourceName(self):
        return self.tokenSource.getSourceName()


    def toString(self, start=None, stop=None):
        """Returns a string of all tokens between start and stop (inclusive)."""
        if self.p == -1:
            self.fillBuffer()

        if start is None:
            start = 0
        elif not isinstance(start, int):
            start = start.index

        if stop is None:
            stop = len(self.tokens) - 1
        elif not isinstance(stop, int):
            stop = stop.index

        if stop >= len(self.tokens):
            stop = len(self.tokens) - 1

        return ''.join([t.text for t in self.tokens[start:stop + 1]])


class RewriteOperation(object):
    """@brief Internal helper class."""

    def __init__(self, stream, index, text):
        self.stream = stream

        # What index into rewrites List are we?
        self.instructionIndex = None

        # Token buffer index.
        self.index = index
        self.text = text

    def execute(self, buf):
        """Execute the rewrite operation by possibly adding to the buffer.
        Return the index of the next token to operate on.
        """

        return self.index

    def toString(self):
        opName = self.__class__.__name__
        return '<{opName}@{0.index}:"{0.text}">'.format(self, opName=opName)

    __str__ = toString
    __repr__ = toString


class InsertBeforeOp(RewriteOperation):
    """@brief Internal helper class."""

    def execute(self, buf):
        buf.write(self.text)
        if self.stream.tokens[self.index].type != EOF:
            buf.write(self.stream.tokens[self.index].text)
        return self.index + 1


class ReplaceOp(RewriteOperation):
    """
    @brief Internal helper class.

    I'm going to try replacing range from x..y with (y-x)+1 ReplaceOp
    instructions.
    """

    def __init__(self, stream, first, last, text):
        super().__init__(stream, first, text)
        self.lastIndex = last


    def execute(self, buf):
        if self.text is not None:
            buf.write(self.text)

        return self.lastIndex + 1


    def toString(self):
        if self.text is None:
            return '<DeleteOp@{0.index}..{0.lastIndex}>'.format(self)

        return '<ReplaceOp@{0.index}..{0.lastIndex}:"{0.text}">'.format(self)

    __str__ = toString
    __repr__ = toString


class TokenRewriteStream(CommonTokenStream):
    """@brief CommonTokenStream that can be modified.

    Useful for dumping out the input stream after doing some
    augmentation or other manipulations.

    You can insert stuff, replace, and delete chunks.  Note that the
    operations are done lazily--only if you convert the buffer to a
    String.  This is very efficient because you are not moving data around
    all the time.  As the buffer of tokens is converted to strings, the
    toString() method(s) check to see if there is an operation at the
    current index.  If so, the operation is done and then normal String
    rendering continues on the buffer.  This is like having multiple Turing
    machine instruction streams (programs) operating on a single input tape. :)

    Since the operations are done lazily at toString-time, operations do not
    screw up the token index values.  That is, an insert operation at token
    index i does not change the index values for tokens i+1..n-1.

    Because operations never actually alter the buffer, you may always get
    the original token stream back without undoing anything.  Since
    the instructions are queued up, you can easily simulate transactions and
    roll back any changes if there is an error just by removing instructions.
    For example,

     CharStream input = new ANTLRFileStream("input");
     TLexer lex = new TLexer(input);
     TokenRewriteStream tokens = new TokenRewriteStream(lex);
     T parser = new T(tokens);
     parser.startRule();

     Then in the rules, you can execute
        Token t,u;
        ...
        input.insertAfter(t, "text to put after t");
        input.insertAfter(u, "text after u");
        System.out.println(tokens.toString());

    Actually, you have to cast the 'input' to a TokenRewriteStream. :(

    You can also have multiple "instruction streams" and get multiple
    rewrites from a single pass over the input.  Just name the instruction
    streams and use that name again when printing the buffer.  This could be
    useful for generating a C file and also its header file--all from the
    same buffer:

        tokens.insertAfter("pass1", t, "text to put after t");
        tokens.insertAfter("pass2", u, "text after u");
        System.out.println(tokens.toString("pass1"));
        System.out.println(tokens.toString("pass2"));

    If you don't use named rewrite streams, a "default" stream is used as
    the first example shows.
    """
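
    # The Java example above, rendered as a Python sketch ("TLexer"/"TParser"
    # and "startRule" stand in for grammar-generated names; t and u are Token
    # objects, e.g. taken from the buffer with tokens.get(i)):
    #
    #   tokens = TokenRewriteStream(TLexer(ANTLRFileStream("input")))
    #   parser = TParser(tokens)
    #   parser.startRule()
    #
    #   tokens.insertAfter(t, "text to put after t")
    #   tokens.insertAfter(u, "text after u")
    #   print(tokens.toString())
    #
    #   # named instruction streams ("programs"):
    #   tokens.insertAfter("pass1", t, "text to put after t")
    #   tokens.insertAfter("pass2", u, "text after u")
    #   print(tokens.toString("pass1"))
    #   print(tokens.toString("pass2"))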

    DEFAULT_PROGRAM_NAME = "default"
    MIN_TOKEN_INDEX = 0

    def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL):
        super().__init__(tokenSource, channel)

        # You may have multiple, named streams of rewrite operations.
        # I'm calling these things "programs."
        #  Maps String (name) -> rewrite (List)
        self.programs = {}
        self.programs[self.DEFAULT_PROGRAM_NAME] = []

        # Map String (program name) -> Integer index
        self.lastRewriteTokenIndexes = {}


    def rollback(self, *args):
        """
        Rollback the instruction stream for a program so that
        the indicated instruction (via instructionIndex) is no
        longer in the stream.  UNTESTED!
        """

        if len(args) == 2:
            programName = args[0]
            instructionIndex = args[1]
        elif len(args) == 1:
            programName = self.DEFAULT_PROGRAM_NAME
            instructionIndex = args[0]
        else:
            raise TypeError("Invalid arguments")

        p = self.programs.get(programName)
        if p:
            self.programs[programName] = (
                p[self.MIN_TOKEN_INDEX:instructionIndex])


    def deleteProgram(self, programName=DEFAULT_PROGRAM_NAME):
        """Reset the program so that no instructions exist"""

        self.rollback(programName, self.MIN_TOKEN_INDEX)


    def insertAfter(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            index = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = args[0]
            index = args[1]
            text = args[2]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(index, Token):
            # index is a Token, grab the stream index from it
            index = index.index

        # to insert after, just insert before next index (even if past end)
        self.insertBefore(programName, index + 1, text)


    def insertBefore(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            index = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = args[0]
            index = args[1]
            text = args[2]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(index, Token):
            # index is a Token, grab the stream index from it
            index = index.index

        op = InsertBeforeOp(self, index, text)
        rewrites = self.getProgram(programName)
        op.instructionIndex = len(rewrites)
        rewrites.append(op)


    def replace(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            first = args[0]
            last = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = self.DEFAULT_PROGRAM_NAME
            first = args[0]
            last = args[1]
            text = args[2]

        elif len(args) == 4:
            programName = args[0]
            first = args[1]
            last = args[2]
            text = args[3]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(first, Token):
            # first is a Token, grab the stream index from it
            first = first.index

        if isinstance(last, Token):
            # last is a Token, grab the stream index from it
            last = last.index

        if first > last or first < 0 or last < 0 or last >= len(self.tokens):
            raise ValueError(
                "replace: range invalid: {}..{} (size={})"
                .format(first, last, len(self.tokens)))

        op = ReplaceOp(self, first, last, text)
        rewrites = self.getProgram(programName)
        op.instructionIndex = len(rewrites)
        rewrites.append(op)


    def delete(self, *args):
        self.replace(*(list(args) + [None]))
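
    # Sketch of the lazy rewrite operations (the token indexes are purely
    # illustrative):
    #
    #   tokens.insertBefore(3, "static ")
    #   tokens.replace(5, 7, "0")          # replace tokens 5..7 with "0"
    #   tokens.delete(9)                   # same as replace(9, 9, None)
    #   print(tokens.toString())           # original stream with edits applied
    #   print(tokens.toOriginalString())   # untouched original is still there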


    def getLastRewriteTokenIndex(self, programName=DEFAULT_PROGRAM_NAME):
        return self.lastRewriteTokenIndexes.get(programName, -1)


    def setLastRewriteTokenIndex(self, programName, i):
        self.lastRewriteTokenIndexes[programName] = i


    def getProgram(self, name):
        p = self.programs.get(name)
        if not p:
            p = self.initializeProgram(name)

        return p


    def initializeProgram(self, name):
        p = []
        self.programs[name] = p
        return p


    def toOriginalString(self, start=None, end=None):
        if self.p == -1:
            self.fillBuffer()

        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            if self.get(i).type != EOF:
                buf.write(self.get(i).text)
            i += 1

        return buf.getvalue()


    def toString(self, *args):
        if self.p == -1:
            self.fillBuffer()

        if len(args) == 0:
            programName = self.DEFAULT_PROGRAM_NAME
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 1:
            programName = args[0]
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            start = args[0]
            end = args[1]

        if start is None:
            start = self.MIN_TOKEN_INDEX
        elif not isinstance(start, int):
            start = start.index

        if end is None:
            end = len(self.tokens) - 1
        elif not isinstance(end, int):
            end = end.index

        # ensure start/end are in range
        if end >= len(self.tokens):
            end = len(self.tokens) - 1

        if start < 0:
            start = 0

        rewrites = self.programs.get(programName)
        if not rewrites:
            # no instructions to execute
            return self.toOriginalString(start, end)

        buf = StringIO()

        # First, optimize instruction stream
        indexToOp = self.reduceToSingleOperationPerIndex(rewrites)

        # Walk buffer, executing instructions and emitting tokens
        i = start
        while i <= end and i < len(self.tokens):
            # remove so any left have index size-1
            op = indexToOp.pop(i, None)

            t = self.tokens[i]
            if op is None:
                # no operation at that index, just dump token
                if t.type != EOF:
                    buf.write(t.text)
                i += 1 # move to next token

            else:
                i = op.execute(buf) # execute operation and skip

        # include stuff after end if it's last index in buffer
        # So, if they did an insertAfter(lastValidIndex, "foo"), include
        # foo if end == lastValidIndex.
        if end == len(self.tokens) - 1:
            # Scan any remaining operations after last token
            # should be included (they will be inserts).
            for i, op in sorted(indexToOp.items()):
                if op.index >= len(self.tokens) - 1:
                    buf.write(op.text)

        return buf.getvalue()

    __str__ = toString


    def reduceToSingleOperationPerIndex(self, rewrites):
        """
        We need to combine operations and report invalid operations (like
        overlapping replaces that are not completely nested).  Inserts to
        same index need to be combined etc...   Here are the cases:

        I.i.u I.j.v                           leave alone, nonoverlapping
        I.i.u I.i.v                           combine: Iivu

        R.i-j.u R.x-y.v | i-j in x-y          delete first R
        R.i-j.u R.i-j.v                       delete first R
        R.i-j.u R.x-y.v | x-y in i-j          ERROR
        R.i-j.u R.x-y.v | boundaries overlap  ERROR

        Delete special case of replace (text==null):
        D.i-j.u D.x-y.v | boundaries overlap  combine to min(left)..max(right)

        I.i.u R.x-y.v   | i in (x+1)-y        delete I (since insert before,
                                              we're not deleting i)
        I.i.u R.x-y.v   | i not in (x+1)-y    leave alone, nonoverlapping

        R.x-y.v I.i.u   | i in x-y            ERROR
        R.x-y.v I.x.u                         R.x-y.uv (combine, delete I)
        R.x-y.v I.i.u   | i not in x-y        leave alone, nonoverlapping

        I.i.u = insert u before op @ index i
        R.x-y.u = replace x-y indexed tokens with u

        First we need to examine replaces.  For any replace op:

          1. wipe out any insertions before op within that range.
          2. Drop any replace op before that is contained completely within
             that range.
          3. Throw exception upon boundary overlap with any previous replace.

        Then we can deal with inserts:

          1. for any inserts to same index, combine even if not adjacent.
          2. for any prior replace with same left boundary, combine this
             insert with replace and delete this replace.
          3. throw exception if index in same range as previous replace

        Don't actually delete; make op null in list. Easier to walk list.
        Later we can throw as we add to index -> op map.

        Note that I.2 R.2-2 will wipe out I.2 even though, technically, the
        inserted stuff would be before the replace range.  But, if you
        add tokens in front of a method body '{' and then delete the method
        body, I think the stuff before the '{' you added should disappear too.

        Return a map from token index to operation.
        """

        # WALK REPLACES
        for i, rop in enumerate(rewrites):
            if not rop:
                continue

            if not isinstance(rop, ReplaceOp):
                continue

            # Wipe prior inserts within range
            for j, iop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if iop.index == rop.index:
                    # E.g., insert before 2, delete 2..2; update replace
                    # text to include insert before, kill insert
                    rewrites[iop.instructionIndex] = None
                    rop.text = self.catOpText(iop.text, rop.text)

                elif iop.index > rop.index and iop.index <= rop.lastIndex:
                    # delete insert as it's a no-op.
                    rewrites[j] = None

            # Drop any prior replaces contained within
            for j, prevRop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if (prevRop.index >= rop.index
                    and prevRop.lastIndex <= rop.lastIndex):
                    # delete replace as it's a no-op.
                    rewrites[j] = None
                    continue

                # throw exception unless disjoint or identical
                disjoint = (prevRop.lastIndex < rop.index
                            or prevRop.index > rop.lastIndex)
                same = (prevRop.index == rop.index
                        and prevRop.lastIndex == rop.lastIndex)

                # Delete special case of replace (text==null):
                # D.i-j.u D.x-y.v | boundaries overlap -> combine to
                # min(left)..max(right)
                if prevRop.text is None and rop.text is None and not disjoint:
                    # kill first delete
                    rewrites[prevRop.instructionIndex] = None

                    rop.index = min(prevRop.index, rop.index)
                    rop.lastIndex = max(prevRop.lastIndex, rop.lastIndex)

                elif not disjoint and not same:
                    raise ValueError(
                        "replace op boundaries of {} overlap with previous {}"
                        .format(rop, prevRop))

        # WALK INSERTS
        for i, iop in enumerate(rewrites):
            if iop is None:
                continue

            if not isinstance(iop, InsertBeforeOp):
                continue

            # combine current insert with prior if any at same index
            for j, prevIop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if prevIop.index == iop.index: # combine objects
                    # convert to strings...we're in process of toString'ing
                    # whole token buffer so no lazy eval issue with any
                    # templates
                    iop.text = self.catOpText(iop.text, prevIop.text)
                    # delete redundant prior insert
                    rewrites[j] = None

            # look for replaces where iop.index is in range; error
            for j, rop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if iop.index == rop.index:
                    rop.text = self.catOpText(iop.text, rop.text)
                    # delete current insert
                    rewrites[i] = None
                    continue

                if iop.index >= rop.index and iop.index <= rop.lastIndex:
                    raise ValueError(
                        "insert op {} within boundaries of previous {}"
                        .format(iop, rop))

        m = {}
        for i, op in enumerate(rewrites):
            if op is None:
                # ignore deleted ops
                continue

            assert op.index not in m, "should only be one op per index"
            m[op.index] = op

        return m


    def catOpText(self, a, b):
        x = ""
        y = ""
        if a:
            x = a
        if b:
            y = b
        return x + y


    def getKindOfOps(self, rewrites, kind, before=None):
        """Get all operations before an index of a particular kind."""

        if before is None:
            before = len(rewrites)
        elif before > len(rewrites):
            before = len(rewrites)

        for i, op in enumerate(rewrites[:before]):
            # ignore deleted
            if op and op.__class__ == kind:
                yield i, op


    def toDebugString(self, start=None, end=None):
        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            # write the token's repr, not its text; StringIO.write needs a str
            buf.write(str(self.get(i)))
            i += 1

        return buf.getvalue()