1"""ANTLR3 runtime package""" 2 3# begin[licence] 4# 5# [The "BSD licence"] 6# Copyright (c) 2005-2012 Terence Parr 7# All rights reserved. 8# 9# Redistribution and use in source and binary forms, with or without 10# modification, are permitted provided that the following conditions 11# are met: 12# 1. Redistributions of source code must retain the above copyright 13# notice, this list of conditions and the following disclaimer. 14# 2. Redistributions in binary form must reproduce the above copyright 15# notice, this list of conditions and the following disclaimer in the 16# documentation and/or other materials provided with the distribution. 17# 3. The name of the author may not be used to endorse or promote products 18# derived from this software without specific prior written permission. 19# 20# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30# 31# end[licence] 32 33from io import StringIO 34 35from .constants import DEFAULT_CHANNEL, EOF 36from .tokens import Token 37 38 39############################################################################ 40# 41# basic interfaces 42# IntStream 43# +- CharStream 44# \- TokenStream 45# 46# subclasses must implemented all methods 47# 48############################################################################ 49 50class IntStream(object): 51 """ 52 @brief Base interface for streams of integer values. 53 54 A simple stream of integers used when all I care about is the char 55 or token type sequence (such as interpretation). 56 """ 57 58 def consume(self): 59 raise NotImplementedError 60 61 62 def LA(self, i): 63 """Get int at current input pointer + i ahead where i=1 is next int. 64 65 Negative indexes are allowed. LA(-1) is previous token (token 66 just matched). LA(-i) where i is before first token should 67 yield -1, invalid char / EOF. 68 """ 69 70 raise NotImplementedError 71 72 73 def mark(self): 74 """ 75 Tell the stream to start buffering if it hasn't already. Return 76 current input position, index(), or some other marker so that 77 when passed to rewind() you get back to the same spot. 78 rewind(mark()) should not affect the input cursor. The Lexer 79 track line/col info as well as input index so its markers are 80 not pure input indexes. Same for tree node streams. 81 """ 82 83 raise NotImplementedError 84 85 86 def index(self): 87 """ 88 Return the current input symbol index 0..n where n indicates the 89 last symbol has been read. The index is the symbol about to be 90 read not the most recently read symbol. 91 """ 92 93 raise NotImplementedError 94 95 96 def rewind(self, marker=None): 97 """ 98 Reset the stream so that next call to index would return marker. 99 The marker will usually be index() but it doesn't have to be. It's 100 just a marker to indicate what state the stream was in. This is 101 essentially calling release() and seek(). If there are markers 102 created after this marker argument, this routine must unroll them 103 like a stack. Assume the state the stream was in when this marker 104 was created. 105 106 If marker is None: 107 Rewind to the input position of the last marker. 108 Used currently only after a cyclic DFA and just 109 before starting a sem/syn predicate to get the 110 input position back to the start of the decision. 111 Do not "pop" the marker off the state. mark(i) 112 and rewind(i) should balance still. It is 113 like invoking rewind(last marker) but it should not "pop" 114 the marker off. It's like seek(last marker's input position). 115 """ 116 117 raise NotImplementedError 118 119 120 def release(self, marker=None): 121 """ 122 You may want to commit to a backtrack but don't want to force the 123 stream to keep bookkeeping objects around for a marker that is 124 no longer necessary. This will have the same behavior as 125 rewind() except it releases resources without the backward seek. 126 This must throw away resources for all markers back to the marker 127 argument. So if you're nested 5 levels of mark(), and then release(2) 128 you have to release resources for depths 2..5. 129 """ 130 131 raise NotImplementedError 132 133 134 def seek(self, index): 135 """ 136 Set the input cursor to the position indicated by index. This is 137 normally used to seek ahead in the input stream. No buffering is 138 required to do this unless you know your stream will use seek to 139 move backwards such as when backtracking. 140 141 This is different from rewind in its multi-directional 142 requirement and in that its argument is strictly an input cursor 143 (index). 144 145 For char streams, seeking forward must update the stream state such 146 as line number. For seeking backwards, you will be presumably 147 backtracking using the mark/rewind mechanism that restores state and 148 so this method does not need to update state when seeking backwards. 149 150 Currently, this method is only used for efficient backtracking using 151 memoization, but in the future it may be used for incremental parsing. 152 153 The index is 0..n-1. A seek to position i means that LA(1) will 154 return the ith symbol. So, seeking to 0 means LA(1) will return the 155 first element in the stream. 156 """ 157 158 raise NotImplementedError 159 160 161 def size(self): 162 """ 163 Only makes sense for streams that buffer everything up probably, but 164 might be useful to display the entire stream or for testing. This 165 value includes a single EOF. 166 """ 167 168 raise NotImplementedError 169 170 171 def getSourceName(self): 172 """ 173 Where are you getting symbols from? Normally, implementations will 174 pass the buck all the way to the lexer who can ask its input stream 175 for the file name or whatever. 176 """ 177 178 raise NotImplementedError 179 180 181class CharStream(IntStream): 182 """ 183 @brief A source of characters for an ANTLR lexer. 184 185 This is an abstract class that must be implemented by a subclass. 186 187 """ 188 189 # pylint does not realize that this is an interface, too 190 #pylint: disable-msg=W0223 191 192 EOF = -1 193 194 def __init__(self): 195 # line number 1..n within the input 196 self._line = 1 197 198 # The index of the character relative to the beginning of the 199 # line 0..n-1 200 self._charPositionInLine = 0 201 202 203 def substring(self, start, stop): 204 """ 205 For infinite streams, you don't need this; primarily I'm providing 206 a useful interface for action code. Just make sure actions don't 207 use this on streams that don't support it. 208 """ 209 210 raise NotImplementedError 211 212 213 def LT(self, i): 214 """ 215 Get the ith character of lookahead. This is the same usually as 216 LA(i). This will be used for labels in the generated 217 lexer code. I'd prefer to return a char here type-wise, but it's 218 probably better to be 32-bit clean and be consistent with LA. 219 """ 220 221 raise NotImplementedError 222 223 224 @property 225 def line(self): 226 """ANTLR tracks the line information automatically""" 227 return self._line 228 229 @line.setter 230 def line(self, value): 231 """ 232 Because this stream can rewind, we need to be able to reset the line 233 """ 234 self._line = value 235 236 237 @property 238 def charPositionInLine(self): 239 """ 240 The index of the character relative to the beginning of the line 0..n-1 241 """ 242 return self._charPositionInLine 243 244 @charPositionInLine.setter 245 def charPositionInLine(self, pos): 246 self._charPositionInLine = pos 247 248 249class TokenStream(IntStream): 250 """ 251 252 @brief A stream of tokens accessing tokens from a TokenSource 253 254 This is an abstract class that must be implemented by a subclass. 255 256 """ 257 258 # pylint does not realize that this is an interface, too 259 #pylint: disable-msg=W0223 260 261 def LT(self, k): 262 """ 263 Get Token at current input pointer + i ahead where i=1 is next Token. 264 i<0 indicates tokens in the past. So -1 is previous token and -2 is 265 two tokens ago. LT(0) is undefined. For i>=n, return Token.EOFToken. 266 Return null for LT(0) and any index that results in an absolute address 267 that is negative. 268 """ 269 270 raise NotImplementedError 271 272 273 def range(self): 274 """ 275 How far ahead has the stream been asked to look? The return 276 value is a valid index from 0..n-1. 277 """ 278 279 raise NotImplementedError 280 281 282 def get(self, i): 283 """ 284 Get a token at an absolute index i; 0..n-1. This is really only 285 needed for profiling and debugging and token stream rewriting. 286 If you don't want to buffer up tokens, then this method makes no 287 sense for you. Naturally you can't use the rewrite stream feature. 288 I believe DebugTokenStream can easily be altered to not use 289 this method, removing the dependency. 290 """ 291 292 raise NotImplementedError 293 294 295 def getTokenSource(self): 296 """ 297 Where is this stream pulling tokens from? This is not the name, but 298 the object that provides Token objects. 299 """ 300 301 raise NotImplementedError 302 303 304 def toString(self, start=None, stop=None): 305 """ 306 Return the text of all tokens from start to stop, inclusive. 307 If the stream does not buffer all the tokens then it can just 308 return "" or null; Users should not access $ruleLabel.text in 309 an action of course in that case. 310 311 Because the user is not required to use a token with an index stored 312 in it, we must provide a means for two token objects themselves to 313 indicate the start/end location. Most often this will just delegate 314 to the other toString(int,int). This is also parallel with 315 the TreeNodeStream.toString(Object,Object). 316 """ 317 318 raise NotImplementedError 319 320 321############################################################################ 322# 323# character streams for use in lexers 324# CharStream 325# \- ANTLRStringStream 326# 327############################################################################ 328 329 330class ANTLRStringStream(CharStream): 331 """ 332 @brief CharStream that pull data from a unicode string. 333 334 A pretty quick CharStream that pulls all data from an array 335 directly. Every method call counts in the lexer. 336 337 """ 338 339 340 def __init__(self, data): 341 """ 342 @param data This should be a unicode string holding the data you want 343 to parse. If you pass in a byte string, the Lexer will choke on 344 non-ascii data. 345 """ 346 347 super().__init__() 348 349 # The data being scanned 350 self.strdata = str(data) 351 self.data = [ord(c) for c in self.strdata] 352 353 # How many characters are actually in the buffer 354 self.n = len(data) 355 356 # 0..n-1 index into string of next char 357 self.p = 0 358 359 # A list of CharStreamState objects that tracks the stream state 360 # values line, charPositionInLine, and p that can change as you 361 # move through the input stream. Indexed from 0..markDepth-1. 362 self._markers = [ ] 363 self.lastMarker = None 364 self.markDepth = 0 365 366 # What is name or source of this char stream? 367 self.name = None 368 369 370 def reset(self): 371 """ 372 Reset the stream so that it's in the same state it was 373 when the object was created *except* the data array is not 374 touched. 375 """ 376 377 self.p = 0 378 self._line = 1 379 self.charPositionInLine = 0 380 self._markers = [ ] 381 self.lastMarker = None 382 self.markDepth = 0 383 384 385 def consume(self): 386 if self.p < self.n: 387 if self.data[self.p] == 10: # ord('\n') 388 self._line += 1 389 self.charPositionInLine = 0 390 else: 391 self.charPositionInLine += 1 392 393 self.p += 1 394 395 # else we reached EOF 396 # just do nothing 397 398 399 def LA(self, i): 400 if i == 0: 401 return 0 # undefined 402 403 if i < 0: 404 i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1] 405 406 if self.p + i - 1 < self.n: 407 return self.data[self.p + i - 1] 408 else: 409 return EOF 410 411 412 413 def LT(self, i): 414 if i == 0: 415 return 0 # undefined 416 417 if i < 0: 418 i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1] 419 420 if self.p + i - 1 < self.n: 421 return self.strdata[self.p + i - 1] 422 else: 423 return EOF 424 425 426 def index(self): 427 """ 428 Return the current input symbol index 0..n where n indicates the 429 last symbol has been read. The index is the index of char to 430 be returned from LA(1). 431 """ 432 433 return self.p 434 435 436 def size(self): 437 return self.n 438 439 440 def mark(self): 441 state = (self.p, self.line, self.charPositionInLine) 442 if self.markDepth < len(self._markers): 443 self._markers[self.markDepth] = state 444 else: 445 self._markers.append(state) 446 self.markDepth += 1 447 448 self.lastMarker = self.markDepth 449 450 return self.lastMarker 451 452 453 def rewind(self, marker=None): 454 if marker is None: 455 marker = self.lastMarker 456 457 p, line, charPositionInLine = self._markers[marker - 1] 458 459 self.seek(p) 460 self._line = line 461 self.charPositionInLine = charPositionInLine 462 self.release(marker) 463 464 465 def release(self, marker=None): 466 if marker is None: 467 marker = self.lastMarker 468 469 self.markDepth = marker - 1 470 471 472 def seek(self, index): 473 """ 474 consume() ahead until p==index; can't just set p=index as we must 475 update line and charPositionInLine. 476 """ 477 478 if index <= self.p: 479 self.p = index # just jump; don't update stream state (line, ...) 480 return 481 482 # seek forward, consume until p hits index 483 while self.p < index: 484 self.consume() 485 486 487 def substring(self, start, stop): 488 return self.strdata[start:stop + 1] 489 490 491 def getSourceName(self): 492 return self.name 493 494 495class ANTLRFileStream(ANTLRStringStream): 496 """ 497 @brief CharStream that opens a file to read the data. 498 499 This is a char buffer stream that is loaded from a file 500 all at once when you construct the object. 501 """ 502 503 def __init__(self, fileName): 504 """ 505 @param fileName The path to the file to be opened. The file will be 506 opened with mode 'r'. 507 508 """ 509 510 self._fileName = fileName 511 512 with open(fileName, 'r') as fp: 513 super().__init__(fp.read()) 514 515 516 @property 517 def fileName(self): 518 return self._fileName 519 520 521class ANTLRInputStream(ANTLRStringStream): 522 """ 523 @brief CharStream that reads data from a file-like object. 524 525 This is a char buffer stream that is loaded from a file like object 526 all at once when you construct the object. 527 528 All input is consumed from the file, but it is not closed. 529 """ 530 531 def __init__(self, file): 532 """ 533 @param file A file-like object holding your input. Only the read() 534 method must be implemented. 535 536 """ 537 538 data = file.read() 539 540 super().__init__(data) 541 542 543# I guess the ANTLR prefix exists only to avoid a name clash with some Java 544# mumbojumbo. A plain "StringStream" looks better to me, which should be 545# the preferred name in Python. 546StringStream = ANTLRStringStream 547FileStream = ANTLRFileStream 548InputStream = ANTLRInputStream 549 550 551############################################################################ 552# 553# Token streams 554# TokenStream 555# +- CommonTokenStream 556# \- TokenRewriteStream 557# 558############################################################################ 559 560 561class CommonTokenStream(TokenStream): 562 """ 563 @brief The most common stream of tokens 564 565 The most common stream of tokens is one where every token is buffered up 566 and tokens are prefiltered for a certain channel (the parser will only 567 see these tokens and cannot change the filter channel number during the 568 parse). 569 """ 570 571 def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL): 572 """ 573 @param tokenSource A TokenSource instance (usually a Lexer) to pull 574 the tokens from. 575 576 @param channel Skip tokens on any channel but this one; this is how we 577 skip whitespace... 578 579 """ 580 581 super().__init__() 582 583 self.tokenSource = tokenSource 584 585 # Record every single token pulled from the source so we can reproduce 586 # chunks of it later. 587 self.tokens = [] 588 589 # Map<tokentype, channel> to override some Tokens' channel numbers 590 self.channelOverrideMap = {} 591 592 # Set<tokentype>; discard any tokens with this type 593 self.discardSet = set() 594 595 # Skip tokens on any channel but this one; this is how we skip 596 # whitespace... 597 self.channel = channel 598 599 # By default, track all incoming tokens 600 self.discardOffChannelTokens = False 601 602 # The index into the tokens list of the current token (next token 603 # to consume). p==-1 indicates that the tokens list is empty 604 self.p = -1 605 606 # Remember last marked position 607 self.lastMarker = None 608 609 # how deep have we gone? 610 self._range = -1 611 612 613 def makeEOFToken(self): 614 return self.tokenSource.makeEOFToken() 615 616 617 def setTokenSource(self, tokenSource): 618 """Reset this token stream by setting its token source.""" 619 620 self.tokenSource = tokenSource 621 self.tokens = [] 622 self.p = -1 623 self.channel = DEFAULT_CHANNEL 624 625 626 def reset(self): 627 self.p = 0 628 self.lastMarker = None 629 630 631 def fillBuffer(self): 632 """ 633 Load all tokens from the token source and put in tokens. 634 This is done upon first LT request because you might want to 635 set some token type / channel overrides before filling buffer. 636 """ 637 638 639 index = 0 640 t = self.tokenSource.nextToken() 641 while t and t.type != EOF: 642 discard = False 643 644 if self.discardSet and t.type in self.discardSet: 645 discard = True 646 647 elif self.discardOffChannelTokens and t.channel != self.channel: 648 discard = True 649 650 # is there a channel override for token type? 651 if t.type in self.channelOverrideMap: 652 overrideChannel = self.channelOverrideMap[t.type] 653 654 if overrideChannel == self.channel: 655 t.channel = overrideChannel 656 else: 657 discard = True 658 659 if not discard: 660 t.index = index 661 self.tokens.append(t) 662 index += 1 663 664 t = self.tokenSource.nextToken() 665 666 # leave p pointing at first token on channel 667 self.p = 0 668 self.p = self.skipOffTokenChannels(self.p) 669 670 671 def consume(self): 672 """ 673 Move the input pointer to the next incoming token. The stream 674 must become active with LT(1) available. consume() simply 675 moves the input pointer so that LT(1) points at the next 676 input symbol. Consume at least one token. 677 678 Walk past any token not on the channel the parser is listening to. 679 """ 680 681 if self.p < len(self.tokens): 682 self.p += 1 683 684 self.p = self.skipOffTokenChannels(self.p) # leave p on valid token 685 686 687 def skipOffTokenChannels(self, i): 688 """ 689 Given a starting index, return the index of the first on-channel 690 token. 691 """ 692 693 n = len(self.tokens) 694 while i < n and self.tokens[i].channel != self.channel: 695 i += 1 696 697 return i 698 699 700 def skipOffTokenChannelsReverse(self, i): 701 while i >= 0 and self.tokens[i].channel != self.channel: 702 i -= 1 703 704 return i 705 706 707 def setTokenTypeChannel(self, ttype, channel): 708 """ 709 A simple filter mechanism whereby you can tell this token stream 710 to force all tokens of type ttype to be on channel. For example, 711 when interpreting, we cannot exec actions so we need to tell 712 the stream to force all WS and NEWLINE to be a different, ignored 713 channel. 714 """ 715 716 self.channelOverrideMap[ttype] = channel 717 718 719 def discardTokenType(self, ttype): 720 self.discardSet.add(ttype) 721 722 723 def getTokens(self, start=None, stop=None, types=None): 724 """ 725 Given a start and stop index, return a list of all tokens in 726 the token type set. Return None if no tokens were found. This 727 method looks at both on and off channel tokens. 728 """ 729 730 if self.p == -1: 731 self.fillBuffer() 732 733 if stop is None or stop > len(self.tokens): 734 stop = len(self.tokens) 735 736 if start is None or start < 0: 737 start = 0 738 739 if start > stop: 740 return None 741 742 if isinstance(types, int): 743 # called with a single type, wrap into set 744 types = set([types]) 745 746 filteredTokens = [ 747 token for token in self.tokens[start:stop] 748 if types is None or token.type in types 749 ] 750 751 if len(filteredTokens) == 0: 752 return None 753 754 return filteredTokens 755 756 757 def LT(self, k): 758 """ 759 Get the ith token from the current position 1..n where k=1 is the 760 first symbol of lookahead. 761 """ 762 763 if self.p == -1: 764 self.fillBuffer() 765 766 if k == 0: 767 return None 768 769 if k < 0: 770 return self.LB(-k) 771 772 i = self.p 773 n = 1 774 # find k good tokens 775 while n < k: 776 # skip off-channel tokens 777 i = self.skipOffTokenChannels(i + 1) # leave p on valid token 778 n += 1 779 780 if i > self._range: 781 self._range = i 782 783 if i < len(self.tokens): 784 return self.tokens[i] 785 else: 786 return self.makeEOFToken() 787 788 789 def LB(self, k): 790 """Look backwards k tokens on-channel tokens""" 791 792 if self.p == -1: 793 self.fillBuffer() 794 795 if k == 0: 796 return None 797 798 if self.p - k < 0: 799 return None 800 801 i = self.p 802 n = 1 803 # find k good tokens looking backwards 804 while n <= k: 805 # skip off-channel tokens 806 i = self.skipOffTokenChannelsReverse(i - 1) # leave p on valid token 807 n += 1 808 809 if i < 0: 810 return None 811 812 return self.tokens[i] 813 814 815 def get(self, i): 816 """ 817 Return absolute token i; ignore which channel the tokens are on; 818 that is, count all tokens not just on-channel tokens. 819 """ 820 821 return self.tokens[i] 822 823 824 def slice(self, start, stop): 825 if self.p == -1: 826 self.fillBuffer() 827 828 if start < 0 or stop < 0: 829 return None 830 831 return self.tokens[start:stop + 1] 832 833 834 def LA(self, i): 835 return self.LT(i).type 836 837 838 def mark(self): 839 self.lastMarker = self.index() 840 return self.lastMarker 841 842 843 def release(self, marker=None): 844 # no resources to release 845 pass 846 847 848 def size(self): 849 return len(self.tokens) 850 851 852 def range(self): 853 return self._range 854 855 856 def index(self): 857 return self.p 858 859 860 def rewind(self, marker=None): 861 if marker is None: 862 marker = self.lastMarker 863 864 self.seek(marker) 865 866 867 def seek(self, index): 868 self.p = index 869 870 871 def getTokenSource(self): 872 return self.tokenSource 873 874 875 def getSourceName(self): 876 return self.tokenSource.getSourceName() 877 878 879 def toString(self, start=None, stop=None): 880 """Returns a string of all tokens between start and stop (inclusive).""" 881 if self.p == -1: 882 self.fillBuffer() 883 884 if start is None: 885 start = 0 886 elif not isinstance(start, int): 887 start = start.index 888 889 if stop is None: 890 stop = len(self.tokens) - 1 891 elif not isinstance(stop, int): 892 stop = stop.index 893 894 if stop >= len(self.tokens): 895 stop = len(self.tokens) - 1 896 897 return ''.join([t.text for t in self.tokens[start:stop + 1]]) 898 899 900class RewriteOperation(object): 901 """@brief Internal helper class.""" 902 903 def __init__(self, stream, index, text): 904 self.stream = stream 905 906 # What index into rewrites List are we? 907 self.instructionIndex = None 908 909 # Token buffer index. 910 self.index = index 911 self.text = text 912 913 def execute(self, buf): 914 """Execute the rewrite operation by possibly adding to the buffer. 915 Return the index of the next token to operate on. 916 """ 917 918 return self.index 919 920 def toString(self): 921 opName = self.__class__.__name__ 922 return '<{opName}@{0.index}:"{0.text}">'.format(self, opName=opName) 923 924 __str__ = toString 925 __repr__ = toString 926 927 928class InsertBeforeOp(RewriteOperation): 929 """@brief Internal helper class.""" 930 931 def execute(self, buf): 932 buf.write(self.text) 933 if self.stream.tokens[self.index].type != EOF: 934 buf.write(self.stream.tokens[self.index].text) 935 return self.index + 1 936 937 938class ReplaceOp(RewriteOperation): 939 """ 940 @brief Internal helper class. 941 942 I'm going to try replacing range from x..y with (y-x)+1 ReplaceOp 943 instructions. 944 """ 945 946 def __init__(self, stream, first, last, text): 947 super().__init__(stream, first, text) 948 self.lastIndex = last 949 950 951 def execute(self, buf): 952 if self.text is not None: 953 buf.write(self.text) 954 955 return self.lastIndex + 1 956 957 958 def toString(self): 959 if self.text is None: 960 return '<DeleteOp@{0.index}..{0.lastindex}>'.format(self) 961 962 return '<ReplaceOp@{0.index}..{0.lastIndex}:"{0.text}">'.format(self) 963 964 __str__ = toString 965 __repr__ = toString 966 967 968class TokenRewriteStream(CommonTokenStream): 969 """@brief CommonTokenStream that can be modified. 970 971 Useful for dumping out the input stream after doing some 972 augmentation or other manipulations. 973 974 You can insert stuff, replace, and delete chunks. Note that the 975 operations are done lazily--only if you convert the buffer to a 976 String. This is very efficient because you are not moving data around 977 all the time. As the buffer of tokens is converted to strings, the 978 toString() method(s) check to see if there is an operation at the 979 current index. If so, the operation is done and then normal String 980 rendering continues on the buffer. This is like having multiple Turing 981 machine instruction streams (programs) operating on a single input tape. :) 982 983 Since the operations are done lazily at toString-time, operations do not 984 screw up the token index values. That is, an insert operation at token 985 index i does not change the index values for tokens i+1..n-1. 986 987 Because operations never actually alter the buffer, you may always get 988 the original token stream back without undoing anything. Since 989 the instructions are queued up, you can easily simulate transactions and 990 roll back any changes if there is an error just by removing instructions. 991 For example, 992 993 CharStream input = new ANTLRFileStream("input"); 994 TLexer lex = new TLexer(input); 995 TokenRewriteStream tokens = new TokenRewriteStream(lex); 996 T parser = new T(tokens); 997 parser.startRule(); 998 999 Then in the rules, you can execute 1000 Token t,u; 1001 ... 1002 input.insertAfter(t, "text to put after t");} 1003 input.insertAfter(u, "text after u");} 1004 System.out.println(tokens.toString()); 1005 1006 Actually, you have to cast the 'input' to a TokenRewriteStream. :( 1007 1008 You can also have multiple "instruction streams" and get multiple 1009 rewrites from a single pass over the input. Just name the instruction 1010 streams and use that name again when printing the buffer. This could be 1011 useful for generating a C file and also its header file--all from the 1012 same buffer: 1013 1014 tokens.insertAfter("pass1", t, "text to put after t");} 1015 tokens.insertAfter("pass2", u, "text after u");} 1016 System.out.println(tokens.toString("pass1")); 1017 System.out.println(tokens.toString("pass2")); 1018 1019 If you don't use named rewrite streams, a "default" stream is used as 1020 the first example shows. 1021 """ 1022 1023 DEFAULT_PROGRAM_NAME = "default" 1024 MIN_TOKEN_INDEX = 0 1025 1026 def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL): 1027 super().__init__(tokenSource, channel) 1028 1029 # You may have multiple, named streams of rewrite operations. 1030 # I'm calling these things "programs." 1031 # Maps String (name) -> rewrite (List) 1032 self.programs = {} 1033 self.programs[self.DEFAULT_PROGRAM_NAME] = [] 1034 1035 # Map String (program name) -> Integer index 1036 self.lastRewriteTokenIndexes = {} 1037 1038 1039 def rollback(self, *args): 1040 """ 1041 Rollback the instruction stream for a program so that 1042 the indicated instruction (via instructionIndex) is no 1043 longer in the stream. UNTESTED! 1044 """ 1045 1046 if len(args) == 2: 1047 programName = args[0] 1048 instructionIndex = args[1] 1049 elif len(args) == 1: 1050 programName = self.DEFAULT_PROGRAM_NAME 1051 instructionIndex = args[0] 1052 else: 1053 raise TypeError("Invalid arguments") 1054 1055 p = self.programs.get(programName) 1056 if p: 1057 self.programs[programName] = ( 1058 p[self.MIN_TOKEN_INDEX:instructionIndex]) 1059 1060 1061 def deleteProgram(self, programName=DEFAULT_PROGRAM_NAME): 1062 """Reset the program so that no instructions exist""" 1063 1064 self.rollback(programName, self.MIN_TOKEN_INDEX) 1065 1066 1067 def insertAfter(self, *args): 1068 if len(args) == 2: 1069 programName = self.DEFAULT_PROGRAM_NAME 1070 index = args[0] 1071 text = args[1] 1072 1073 elif len(args) == 3: 1074 programName = args[0] 1075 index = args[1] 1076 text = args[2] 1077 1078 else: 1079 raise TypeError("Invalid arguments") 1080 1081 if isinstance(index, Token): 1082 # index is a Token, grap the stream index from it 1083 index = index.index 1084 1085 # to insert after, just insert before next index (even if past end) 1086 self.insertBefore(programName, index + 1, text) 1087 1088 1089 def insertBefore(self, *args): 1090 if len(args) == 2: 1091 programName = self.DEFAULT_PROGRAM_NAME 1092 index = args[0] 1093 text = args[1] 1094 1095 elif len(args) == 3: 1096 programName = args[0] 1097 index = args[1] 1098 text = args[2] 1099 1100 else: 1101 raise TypeError("Invalid arguments") 1102 1103 if isinstance(index, Token): 1104 # index is a Token, grab the stream index from it 1105 index = index.index 1106 1107 op = InsertBeforeOp(self, index, text) 1108 rewrites = self.getProgram(programName) 1109 op.instructionIndex = len(rewrites) 1110 rewrites.append(op) 1111 1112 1113 def replace(self, *args): 1114 if len(args) == 2: 1115 programName = self.DEFAULT_PROGRAM_NAME 1116 first = args[0] 1117 last = args[0] 1118 text = args[1] 1119 1120 elif len(args) == 3: 1121 programName = self.DEFAULT_PROGRAM_NAME 1122 first = args[0] 1123 last = args[1] 1124 text = args[2] 1125 1126 elif len(args) == 4: 1127 programName = args[0] 1128 first = args[1] 1129 last = args[2] 1130 text = args[3] 1131 1132 else: 1133 raise TypeError("Invalid arguments") 1134 1135 if isinstance(first, Token): 1136 # first is a Token, grap the stream index from it 1137 first = first.index 1138 1139 if isinstance(last, Token): 1140 # last is a Token, grap the stream index from it 1141 last = last.index 1142 1143 if first > last or first < 0 or last < 0 or last >= len(self.tokens): 1144 raise ValueError( 1145 "replace: range invalid: {}..{} (size={})" 1146 .format(first, last, len(self.tokens))) 1147 1148 op = ReplaceOp(self, first, last, text) 1149 rewrites = self.getProgram(programName) 1150 op.instructionIndex = len(rewrites) 1151 rewrites.append(op) 1152 1153 1154 def delete(self, *args): 1155 self.replace(*(list(args) + [None])) 1156 1157 1158 def getLastRewriteTokenIndex(self, programName=DEFAULT_PROGRAM_NAME): 1159 return self.lastRewriteTokenIndexes.get(programName, -1) 1160 1161 1162 def setLastRewriteTokenIndex(self, programName, i): 1163 self.lastRewriteTokenIndexes[programName] = i 1164 1165 1166 def getProgram(self, name): 1167 p = self.programs.get(name) 1168 if not p: 1169 p = self.initializeProgram(name) 1170 1171 return p 1172 1173 1174 def initializeProgram(self, name): 1175 p = [] 1176 self.programs[name] = p 1177 return p 1178 1179 1180 def toOriginalString(self, start=None, end=None): 1181 if self.p == -1: 1182 self.fillBuffer() 1183 1184 if start is None: 1185 start = self.MIN_TOKEN_INDEX 1186 if end is None: 1187 end = self.size() - 1 1188 1189 buf = StringIO() 1190 i = start 1191 while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens): 1192 if self.get(i).type != EOF: 1193 buf.write(self.get(i).text) 1194 i += 1 1195 1196 return buf.getvalue() 1197 1198 1199 def toString(self, *args): 1200 if self.p == -1: 1201 self.fillBuffer() 1202 1203 if len(args) == 0: 1204 programName = self.DEFAULT_PROGRAM_NAME 1205 start = self.MIN_TOKEN_INDEX 1206 end = self.size() - 1 1207 1208 elif len(args) == 1: 1209 programName = args[0] 1210 start = self.MIN_TOKEN_INDEX 1211 end = self.size() - 1 1212 1213 elif len(args) == 2: 1214 programName = self.DEFAULT_PROGRAM_NAME 1215 start = args[0] 1216 end = args[1] 1217 1218 if start is None: 1219 start = self.MIN_TOKEN_INDEX 1220 elif not isinstance(start, int): 1221 start = start.index 1222 1223 if end is None: 1224 end = len(self.tokens) - 1 1225 elif not isinstance(end, int): 1226 end = end.index 1227 1228 # ensure start/end are in range 1229 if end >= len(self.tokens): 1230 end = len(self.tokens) - 1 1231 1232 if start < 0: 1233 start = 0 1234 1235 rewrites = self.programs.get(programName) 1236 if not rewrites: 1237 # no instructions to execute 1238 return self.toOriginalString(start, end) 1239 1240 buf = StringIO() 1241 1242 # First, optimize instruction stream 1243 indexToOp = self.reduceToSingleOperationPerIndex(rewrites) 1244 1245 # Walk buffer, executing instructions and emitting tokens 1246 i = start 1247 while i <= end and i < len(self.tokens): 1248 # remove so any left have index size-1 1249 op = indexToOp.pop(i, None) 1250 1251 t = self.tokens[i] 1252 if op is None: 1253 # no operation at that index, just dump token 1254 if t.type != EOF: 1255 buf.write(t.text) 1256 i += 1 # move to next token 1257 1258 else: 1259 i = op.execute(buf) # execute operation and skip 1260 1261 # include stuff after end if it's last index in buffer 1262 # So, if they did an insertAfter(lastValidIndex, "foo"), include 1263 # foo if end == lastValidIndex. 1264 if end == len(self.tokens) - 1: 1265 # Scan any remaining operations after last token 1266 # should be included (they will be inserts). 1267 for i, op in sorted(indexToOp.items()): 1268 if op.index >= len(self.tokens) - 1: 1269 buf.write(op.text) 1270 1271 return buf.getvalue() 1272 1273 __str__ = toString 1274 1275 1276 def reduceToSingleOperationPerIndex(self, rewrites): 1277 """ 1278 We need to combine operations and report invalid operations (like 1279 overlapping replaces that are not completed nested). Inserts to 1280 same index need to be combined etc... Here are the cases: 1281 1282 I.i.u I.j.v leave alone, nonoverlapping 1283 I.i.u I.i.v combine: Iivu 1284 1285 R.i-j.u R.x-y.v | i-j in x-y delete first R 1286 R.i-j.u R.i-j.v delete first R 1287 R.i-j.u R.x-y.v | x-y in i-j ERROR 1288 R.i-j.u R.x-y.v | boundaries overlap ERROR 1289 1290 Delete special case of replace (text==null): 1291 D.i-j.u D.x-y.v | boundaries overlapcombine to 1292 max(min)..max(right) 1293 1294 I.i.u R.x-y.v | i in (x+1)-ydelete I (since 1295 insert before we're not deleting 1296 i) 1297 I.i.u R.x-y.v | i not in (x+1)-yleave alone, 1298 nonoverlapping 1299 1300 R.x-y.v I.i.u | i in x-y ERROR 1301 R.x-y.v I.x.u R.x-y.uv (combine, delete I) 1302 R.x-y.v I.i.u | i not in x-y leave alone, nonoverlapping 1303 1304 I.i.u = insert u before op @ index i 1305 R.x-y.u = replace x-y indexed tokens with u 1306 1307 First we need to examine replaces. For any replace op: 1308 1309 1. wipe out any insertions before op within that range. 1310 2. Drop any replace op before that is contained completely within 1311 that range. 1312 3. Throw exception upon boundary overlap with any previous replace. 1313 1314 Then we can deal with inserts: 1315 1316 1. for any inserts to same index, combine even if not adjacent. 1317 2. for any prior replace with same left boundary, combine this 1318 insert with replace and delete this replace. 1319 3. throw exception if index in same range as previous replace 1320 1321 Don't actually delete; make op null in list. Easier to walk list. 1322 Later we can throw as we add to index -> op map. 1323 1324 Note that I.2 R.2-2 will wipe out I.2 even though, technically, the 1325 inserted stuff would be before the replace range. But, if you 1326 add tokens in front of a method body '{' and then delete the method 1327 body, I think the stuff before the '{' you added should disappear too. 1328 1329 Return a map from token index to operation. 1330 """ 1331 1332 # WALK REPLACES 1333 for i, rop in enumerate(rewrites): 1334 if not rop: 1335 continue 1336 1337 if not isinstance(rop, ReplaceOp): 1338 continue 1339 1340 # Wipe prior inserts within range 1341 for j, iop in self.getKindOfOps(rewrites, InsertBeforeOp, i): 1342 if iop.index == rop.index: 1343 # E.g., insert before 2, delete 2..2; update replace 1344 # text to include insert before, kill insert 1345 rewrites[iop.instructionIndex] = None 1346 rop.text = self.catOpText(iop.text, rop.text) 1347 1348 elif iop.index > rop.index and iop.index <= rop.lastIndex: 1349 # delete insert as it's a no-op. 1350 rewrites[j] = None 1351 1352 # Drop any prior replaces contained within 1353 for j, prevRop in self.getKindOfOps(rewrites, ReplaceOp, i): 1354 if (prevRop.index >= rop.index 1355 and prevRop.lastIndex <= rop.lastIndex): 1356 # delete replace as it's a no-op. 1357 rewrites[j] = None 1358 continue 1359 1360 # throw exception unless disjoint or identical 1361 disjoint = (prevRop.lastIndex < rop.index 1362 or prevRop.index > rop.lastIndex) 1363 same = (prevRop.index == rop.index 1364 and prevRop.lastIndex == rop.lastIndex) 1365 1366 # Delete special case of replace (text==null): 1367 # D.i-j.u D.x-y.v| boundaries overlapcombine to 1368 # max(min)..max(right) 1369 if prevRop.text is None and rop.text is None and not disjoint: 1370 # kill first delete 1371 rewrites[prevRop.instructionIndex] = None 1372 1373 rop.index = min(prevRop.index, rop.index) 1374 rop.lastIndex = max(prevRop.lastIndex, rop.lastIndex) 1375 1376 elif not disjoint and not same: 1377 raise ValueError( 1378 "replace op boundaries of {} overlap with previous {}" 1379 .format(rop, prevRop)) 1380 1381 # WALK INSERTS 1382 for i, iop in enumerate(rewrites): 1383 if iop is None: 1384 continue 1385 1386 if not isinstance(iop, InsertBeforeOp): 1387 continue 1388 1389 # combine current insert with prior if any at same index 1390 for j, prevIop in self.getKindOfOps(rewrites, InsertBeforeOp, i): 1391 if prevIop.index == iop.index: # combine objects 1392 # convert to strings...we're in process of toString'ing 1393 # whole token buffer so no lazy eval issue with any 1394 # templates 1395 iop.text = self.catOpText(iop.text, prevIop.text) 1396 # delete redundant prior insert 1397 rewrites[j] = None 1398 1399 # look for replaces where iop.index is in range; error 1400 for j, rop in self.getKindOfOps(rewrites, ReplaceOp, i): 1401 if iop.index == rop.index: 1402 rop.text = self.catOpText(iop.text, rop.text) 1403 # delete current insert 1404 rewrites[i] = None 1405 continue 1406 1407 if iop.index >= rop.index and iop.index <= rop.lastIndex: 1408 raise ValueError( 1409 "insert op {} within boundaries of previous {}" 1410 .format(iop, rop)) 1411 1412 m = {} 1413 for i, op in enumerate(rewrites): 1414 if op is None: 1415 # ignore deleted ops 1416 continue 1417 1418 assert op.index not in m, "should only be one op per index" 1419 m[op.index] = op 1420 1421 return m 1422 1423 1424 def catOpText(self, a, b): 1425 x = "" 1426 y = "" 1427 if a: 1428 x = a 1429 if b: 1430 y = b 1431 return x + y 1432 1433 1434 def getKindOfOps(self, rewrites, kind, before=None): 1435 """Get all operations before an index of a particular kind.""" 1436 1437 if before is None: 1438 before = len(rewrites) 1439 elif before > len(rewrites): 1440 before = len(rewrites) 1441 1442 for i, op in enumerate(rewrites[:before]): 1443 # ignore deleted 1444 if op and op.__class__ == kind: 1445 yield i, op 1446 1447 1448 def toDebugString(self, start=None, end=None): 1449 if start is None: 1450 start = self.MIN_TOKEN_INDEX 1451 if end is None: 1452 end = self.size() - 1 1453 1454 buf = StringIO() 1455 i = start 1456 while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens): 1457 buf.write(self.get(i)) 1458 i += 1 1459 1460 return buf.getvalue() 1461