1"""ANTLR3 runtime package""" 2 3# begin[licence] 4# 5# [The "BSD licence"] 6# Copyright (c) 2005-2008 Terence Parr 7# All rights reserved. 8# 9# Redistribution and use in source and binary forms, with or without 10# modification, are permitted provided that the following conditions 11# are met: 12# 1. Redistributions of source code must retain the above copyright 13# notice, this list of conditions and the following disclaimer. 14# 2. Redistributions in binary form must reproduce the above copyright 15# notice, this list of conditions and the following disclaimer in the 16# documentation and/or other materials provided with the distribution. 17# 3. The name of the author may not be used to endorse or promote products 18# derived from this software without specific prior written permission. 19# 20# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30# 31# end[licence] 32 33import codecs 34from StringIO import StringIO 35 36from antlr3.constants import DEFAULT_CHANNEL, EOF 37from antlr3.tokens import Token, CommonToken 38 39 40############################################################################ 41# 42# basic interfaces 43# IntStream 44# +- CharStream 45# \- TokenStream 46# 47# subclasses must implemented all methods 48# 49############################################################################ 50 51class IntStream(object): 52 """ 53 @brief Base interface for streams of integer values. 54 55 A simple stream of integers used when all I care about is the char 56 or token type sequence (such as interpretation). 57 """ 58 59 def consume(self): 60 raise NotImplementedError 61 62 63 def LA(self, i): 64 """Get int at current input pointer + i ahead where i=1 is next int. 65 66 Negative indexes are allowed. LA(-1) is previous token (token 67 just matched). LA(-i) where i is before first token should 68 yield -1, invalid char / EOF. 69 """ 70 71 raise NotImplementedError 72 73 74 def mark(self): 75 """ 76 Tell the stream to start buffering if it hasn't already. Return 77 current input position, index(), or some other marker so that 78 when passed to rewind() you get back to the same spot. 79 rewind(mark()) should not affect the input cursor. The Lexer 80 track line/col info as well as input index so its markers are 81 not pure input indexes. Same for tree node streams. 82 """ 83 84 raise NotImplementedError 85 86 87 def index(self): 88 """ 89 Return the current input symbol index 0..n where n indicates the 90 last symbol has been read. The index is the symbol about to be 91 read not the most recently read symbol. 92 """ 93 94 raise NotImplementedError 95 96 97 def rewind(self, marker=None): 98 """ 99 Reset the stream so that next call to index would return marker. 100 The marker will usually be index() but it doesn't have to be. It's 101 just a marker to indicate what state the stream was in. 
        This is essentially calling release() and seek(). If there are
        markers created after this marker argument, this routine must unroll
        them like a stack. Assume the state the stream was in when this
        marker was created.

        If marker is None:
            Rewind to the input position of the last marker. Used currently
            only after a cyclic DFA and just before starting a sem/syn
            predicate, to get the input position back to the start of the
            decision. Does not "pop" the marker off the state; mark(i) and
            rewind(i) should still balance. It is like invoking
            rewind(last marker) but it should not "pop" the marker off.
            It's like seek(last marker's input position).
        """

        raise NotImplementedError


    def release(self, marker=None):
        """
        You may want to commit to a backtrack but don't want to force the
        stream to keep bookkeeping objects around for a marker that is
        no longer necessary. This will have the same behavior as rewind()
        except it releases resources without the backward seek. This must
        throw away resources for all markers back to the marker argument.
        So if you're nested 5 levels of mark() and then call release(2),
        you have to release resources for depths 2..5.
        """

        raise NotImplementedError


    def seek(self, index):
        """
        Set the input cursor to the position indicated by index. This is
        normally used to seek ahead in the input stream. No buffering is
        required to do this unless you know your stream will use seek to
        move backwards, such as when backtracking.

        This is different from rewind in its multi-directional
        requirement and in that its argument is strictly an input cursor
        (index).

        For char streams, seeking forward must update the stream state such
        as the line number. For seeking backwards, you will presumably be
        backtracking using the mark/rewind mechanism that restores state, so
        this method does not need to update state when seeking backwards.

        Currently, this method is only used for efficient backtracking using
        memoization, but in the future it may be used for incremental parsing.

        The index is 0..n-1. A seek to position i means that LA(1) will
        return the ith symbol. So, seeking to 0 means LA(1) will return the
        first element in the stream.
        """

        raise NotImplementedError


    def size(self):
        """
        Only makes sense for streams that buffer everything up, probably, but
        might be useful to display the entire stream or for testing. This
        value includes a single EOF.
        """

        raise NotImplementedError


    def getSourceName(self):
        """
        Where are you getting symbols from? Normally, implementations will
        pass the buck all the way to the lexer, which can ask its input
        stream for the file name or whatever.
        """

        raise NotImplementedError


class CharStream(IntStream):
    """
    @brief A source of characters for an ANTLR lexer.

    This is an abstract class that must be implemented by a subclass.
    """

    # pylint does not realize that this is an interface, too
    #pylint: disable-msg=W0223

    EOF = -1


    def substring(self, start, stop):
        """
        For infinite streams, you don't need this; primarily I'm providing
        a useful interface for action code. Just make sure actions don't
        use this on streams that don't support it.
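
        For example, with a buffered stream such as ANTLRStringStream,
        stop is inclusive:

            ANTLRStringStream(u'hello world').substring(0, 4)  # -> u'hello'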
201 """ 202 203 raise NotImplementedError 204 205 206 def LT(self, i): 207 """ 208 Get the ith character of lookahead. This is the same usually as 209 LA(i). This will be used for labels in the generated 210 lexer code. I'd prefer to return a char here type-wise, but it's 211 probably better to be 32-bit clean and be consistent with LA. 212 """ 213 214 raise NotImplementedError 215 216 217 def getLine(self): 218 """ANTLR tracks the line information automatically""" 219 220 raise NotImplementedError 221 222 223 def setLine(self, line): 224 """ 225 Because this stream can rewind, we need to be able to reset the line 226 """ 227 228 raise NotImplementedError 229 230 231 def getCharPositionInLine(self): 232 """ 233 The index of the character relative to the beginning of the line 0..n-1 234 """ 235 236 raise NotImplementedError 237 238 239 def setCharPositionInLine(self, pos): 240 raise NotImplementedError 241 242 243class TokenStream(IntStream): 244 """ 245 246 @brief A stream of tokens accessing tokens from a TokenSource 247 248 This is an abstract class that must be implemented by a subclass. 249 250 """ 251 252 # pylint does not realize that this is an interface, too 253 #pylint: disable-msg=W0223 254 255 def LT(self, k): 256 """ 257 Get Token at current input pointer + i ahead where i=1 is next Token. 258 i<0 indicates tokens in the past. So -1 is previous token and -2 is 259 two tokens ago. LT(0) is undefined. For i>=n, return Token.EOFToken. 260 Return null for LT(0) and any index that results in an absolute address 261 that is negative. 262 """ 263 264 raise NotImplementedError 265 266 267 def range(self): 268 """ 269 How far ahead has the stream been asked to look? The return 270 value is a valid index from 0..n-1. 271 """ 272 273 raise NotImplementedError 274 275 276 def get(self, i): 277 """ 278 Get a token at an absolute index i; 0..n-1. This is really only 279 needed for profiling and debugging and token stream rewriting. 280 If you don't want to buffer up tokens, then this method makes no 281 sense for you. Naturally you can't use the rewrite stream feature. 282 I believe DebugTokenStream can easily be altered to not use 283 this method, removing the dependency. 284 """ 285 286 raise NotImplementedError 287 288 289 def getTokenSource(self): 290 """ 291 Where is this stream pulling tokens from? This is not the name, but 292 the object that provides Token objects. 293 """ 294 295 raise NotImplementedError 296 297 298 def toString(self, start=None, stop=None): 299 """ 300 Return the text of all tokens from start to stop, inclusive. 301 If the stream does not buffer all the tokens then it can just 302 return "" or null; Users should not access $ruleLabel.text in 303 an action of course in that case. 304 305 Because the user is not required to use a token with an index stored 306 in it, we must provide a means for two token objects themselves to 307 indicate the start/end location. Most often this will just delegate 308 to the other toString(int,int). This is also parallel with 309 the TreeNodeStream.toString(Object,Object). 310 """ 311 312 raise NotImplementedError 313 314 315############################################################################ 316# 317# character streams for use in lexers 318# CharStream 319# \- ANTLRStringStream 320# 321############################################################################ 322 323 324class ANTLRStringStream(CharStream): 325 """ 326 @brief CharStream that pull data from a unicode string. 

    A pretty quick CharStream that pulls all data from an array
    directly. Every method call counts in the lexer.
    """


    def __init__(self, data):
        """
        @param data This should be a unicode string holding the data you want
           to parse. If you pass in a byte string, the Lexer will choke on
           non-ascii data.
        """

        CharStream.__init__(self)

        # The data being scanned
        self.strdata = unicode(data)
        self.data = [ord(c) for c in self.strdata]

        # How many characters are actually in the buffer
        self.n = len(data)

        # 0..n-1 index into string of next char
        self.p = 0

        # line number 1..n within the input
        self.line = 1

        # The index of the character relative to the beginning of the
        # line 0..n-1
        self.charPositionInLine = 0

        # A list of CharStreamState objects that tracks the stream state
        # values line, charPositionInLine, and p that can change as you
        # move through the input stream. Indexed from 0..markDepth-1.
        self._markers = [ ]
        self.lastMarker = None
        self.markDepth = 0

        # What is the name or source of this char stream?
        self.name = None


    def reset(self):
        """
        Reset the stream so that it's in the same state it was
        when the object was created *except* the data array is not
        touched.
        """

        self.p = 0
        self.line = 1
        self.charPositionInLine = 0
        self._markers = [ ]


    def consume(self):
        try:
            if self.data[self.p] == 10: # \n
                self.line += 1
                self.charPositionInLine = 0
            else:
                self.charPositionInLine += 1

            self.p += 1

        except IndexError:
            # happens when we have reached EOF and self.data[self.p] fails;
            # just do nothing
            pass


    def LA(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1]

        try:
            return self.data[self.p+i-1]
        except IndexError:
            return EOF


    def LT(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LT(-1) to use offset i=0; then strdata[p+0-1]

        try:
            return self.strdata[self.p+i-1]
        except IndexError:
            return EOF


    def index(self):
        """
        Return the current input symbol index 0..n where n indicates the
        last symbol has been read. The index is the index of the char to
        be returned from LA(1).
        """

        return self.p


    def size(self):
        return self.n


    def mark(self):
        state = (self.p, self.line, self.charPositionInLine)
        try:
            self._markers[self.markDepth] = state
        except IndexError:
            self._markers.append(state)
        self.markDepth += 1

        self.lastMarker = self.markDepth

        return self.lastMarker


    def rewind(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        p, line, charPositionInLine = self._markers[marker-1]

        self.seek(p)
        self.line = line
        self.charPositionInLine = charPositionInLine
        self.release(marker)


    def release(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        self.markDepth = marker-1


    def seek(self, index):
        """
        consume() ahead until p==index; can't just set p=index as we must
        update line and charPositionInLine.
        """

        if index <= self.p:
            self.p = index # just jump; don't update stream state (line, ...)
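            # (callers seeking backwards, like rewind(), restore line and
            # charPositionInLine themselves from a saved marker)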
            return

        # seek forward, consume until p hits index
        while self.p < index:
            self.consume()


    def substring(self, start, stop):
        return self.strdata[start:stop+1]


    def getLine(self):
        """Using setter/getter methods is deprecated. Use o.line instead."""
        return self.line


    def getCharPositionInLine(self):
        """
        Using setter/getter methods is deprecated. Use o.charPositionInLine
        instead.
        """
        return self.charPositionInLine


    def setLine(self, line):
        """Using setter/getter methods is deprecated. Use o.line instead."""
        self.line = line


    def setCharPositionInLine(self, pos):
        """
        Using setter/getter methods is deprecated. Use o.charPositionInLine
        instead.
        """
        self.charPositionInLine = pos


    def getSourceName(self):
        return self.name


class ANTLRFileStream(ANTLRStringStream):
    """
    @brief CharStream that opens a file to read the data.

    This is a char buffer stream that is loaded from a file
    all at once when you construct the object.
    """

    def __init__(self, fileName, encoding=None):
        """
        @param fileName The path to the file to be opened. The file will be
           opened with mode 'rb'.

        @param encoding If you set the optional encoding argument, then the
           data will be decoded on the fly.
        """

        self.fileName = fileName

        fp = codecs.open(fileName, 'rb', encoding)
        try:
            data = fp.read()
        finally:
            fp.close()

        ANTLRStringStream.__init__(self, data)


    def getSourceName(self):
        """Deprecated, access o.fileName directly."""

        return self.fileName


class ANTLRInputStream(ANTLRStringStream):
    """
    @brief CharStream that reads data from a file-like object.

    This is a char buffer stream that is loaded from a file-like object
    all at once when you construct the object.

    All input is consumed from the file, but it is not closed.
    """

    def __init__(self, file, encoding=None):
        """
        @param file A file-like object holding your input. Only the read()
           method must be implemented.

        @param encoding If you set the optional encoding argument, then the
           data will be decoded on the fly.
        """

        if encoding is not None:
            # wrap input in a decoding reader
            reader = codecs.lookup(encoding)[2]
            file = reader(file)

        data = file.read()

        ANTLRStringStream.__init__(self, data)


# I guess the ANTLR prefix exists only to avoid a name clash with some Java
# mumbojumbo. A plain "StringStream" looks better to me, which should be
# the preferred name in Python.
StringStream = ANTLRStringStream
FileStream = ANTLRFileStream
InputStream = ANTLRInputStream


############################################################################
#
# token streams
#   TokenStream
#    +- CommonTokenStream
#        \- TokenRewriteStream
#
############################################################################


class CommonTokenStream(TokenStream):
    """
    @brief The most common stream of tokens

    The most common stream of tokens is one where every token is buffered up
    and tokens are prefiltered for a certain channel (the parser will only
    see these tokens and cannot change the filter channel number during the
    parse).
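
    A minimal usage sketch (TLexer stands for any ANTLR-generated lexer
    class; any TokenSource with a nextToken() method will do):

        lexer = TLexer(ANTLRStringStream(u'...'))
        tokens = CommonTokenStream(lexer)
        first = tokens.LT(1)   # fills the buffer on first use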
615 """ 616 617 def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL): 618 """ 619 @param tokenSource A TokenSource instance (usually a Lexer) to pull 620 the tokens from. 621 622 @param channel Skip tokens on any channel but this one; this is how we 623 skip whitespace... 624 625 """ 626 627 TokenStream.__init__(self) 628 629 self.tokenSource = tokenSource 630 631 # Record every single token pulled from the source so we can reproduce 632 # chunks of it later. 633 self.tokens = [] 634 635 # Map<tokentype, channel> to override some Tokens' channel numbers 636 self.channelOverrideMap = {} 637 638 # Set<tokentype>; discard any tokens with this type 639 self.discardSet = set() 640 641 # Skip tokens on any channel but this one; this is how we skip 642 # whitespace... 643 self.channel = channel 644 645 # By default, track all incoming tokens 646 self.discardOffChannelTokens = False 647 648 # The index into the tokens list of the current token (next token 649 # to consume). p==-1 indicates that the tokens list is empty 650 self.p = -1 651 652 # Remember last marked position 653 self.lastMarker = None 654 655 # how deep have we gone? 656 self._range = -1 657 658 659 def makeEOFToken(self): 660 return self.tokenSource.makeEOFToken() 661 662 663 def setTokenSource(self, tokenSource): 664 """Reset this token stream by setting its token source.""" 665 666 self.tokenSource = tokenSource 667 self.tokens = [] 668 self.p = -1 669 self.channel = DEFAULT_CHANNEL 670 671 672 def reset(self): 673 self.p = 0 674 self.lastMarker = None 675 676 677 def fillBuffer(self): 678 """ 679 Load all tokens from the token source and put in tokens. 680 This is done upon first LT request because you might want to 681 set some token type / channel overrides before filling buffer. 682 """ 683 684 685 index = 0 686 t = self.tokenSource.nextToken() 687 while t is not None and t.type != EOF: 688 discard = False 689 690 if self.discardSet is not None and t.type in self.discardSet: 691 discard = True 692 693 elif self.discardOffChannelTokens and t.channel != self.channel: 694 discard = True 695 696 # is there a channel override for token type? 697 try: 698 overrideChannel = self.channelOverrideMap[t.type] 699 700 except KeyError: 701 # no override for this type 702 pass 703 704 else: 705 if overrideChannel == self.channel: 706 t.channel = overrideChannel 707 else: 708 discard = True 709 710 if not discard: 711 t.index = index 712 self.tokens.append(t) 713 index += 1 714 715 t = self.tokenSource.nextToken() 716 717 # leave p pointing at first token on channel 718 self.p = 0 719 self.p = self.skipOffTokenChannels(self.p) 720 721 722 def consume(self): 723 """ 724 Move the input pointer to the next incoming token. The stream 725 must become active with LT(1) available. consume() simply 726 moves the input pointer so that LT(1) points at the next 727 input symbol. Consume at least one token. 728 729 Walk past any token not on the channel the parser is listening to. 730 """ 731 732 if self.p < len(self.tokens): 733 self.p += 1 734 735 self.p = self.skipOffTokenChannels(self.p) # leave p on valid token 736 737 738 def skipOffTokenChannels(self, i): 739 """ 740 Given a starting index, return the index of the first on-channel 741 token. 
742 """ 743 744 try: 745 while self.tokens[i].channel != self.channel: 746 i += 1 747 except IndexError: 748 # hit the end of token stream 749 pass 750 751 return i 752 753 754 def skipOffTokenChannelsReverse(self, i): 755 while i >= 0 and self.tokens[i].channel != self.channel: 756 i -= 1 757 758 return i 759 760 761 def setTokenTypeChannel(self, ttype, channel): 762 """ 763 A simple filter mechanism whereby you can tell this token stream 764 to force all tokens of type ttype to be on channel. For example, 765 when interpreting, we cannot exec actions so we need to tell 766 the stream to force all WS and NEWLINE to be a different, ignored 767 channel. 768 """ 769 770 self.channelOverrideMap[ttype] = channel 771 772 773 def discardTokenType(self, ttype): 774 self.discardSet.add(ttype) 775 776 777 def getTokens(self, start=None, stop=None, types=None): 778 """ 779 Given a start and stop index, return a list of all tokens in 780 the token type set. Return None if no tokens were found. This 781 method looks at both on and off channel tokens. 782 """ 783 784 if self.p == -1: 785 self.fillBuffer() 786 787 if stop is None or stop >= len(self.tokens): 788 stop = len(self.tokens) - 1 789 790 if start is None or stop < 0: 791 start = 0 792 793 if start > stop: 794 return None 795 796 if isinstance(types, (int, long)): 797 # called with a single type, wrap into set 798 types = set([types]) 799 800 filteredTokens = [ 801 token for token in self.tokens[start:stop] 802 if types is None or token.type in types 803 ] 804 805 if len(filteredTokens) == 0: 806 return None 807 808 return filteredTokens 809 810 811 def LT(self, k): 812 """ 813 Get the ith token from the current position 1..n where k=1 is the 814 first symbol of lookahead. 815 """ 816 817 if self.p == -1: 818 self.fillBuffer() 819 820 if k == 0: 821 return None 822 823 if k < 0: 824 return self.LB(-k) 825 826 i = self.p 827 n = 1 828 # find k good tokens 829 while n < k: 830 # skip off-channel tokens 831 i = self.skipOffTokenChannels(i+1) # leave p on valid token 832 n += 1 833 834 if i > self._range: 835 self._range = i 836 837 try: 838 return self.tokens[i] 839 except IndexError: 840 return self.makeEOFToken() 841 842 843 def LB(self, k): 844 """Look backwards k tokens on-channel tokens""" 845 846 if self.p == -1: 847 self.fillBuffer() 848 849 if k == 0: 850 return None 851 852 if self.p - k < 0: 853 return None 854 855 i = self.p 856 n = 1 857 # find k good tokens looking backwards 858 while n <= k: 859 # skip off-channel tokens 860 i = self.skipOffTokenChannelsReverse(i-1) # leave p on valid token 861 n += 1 862 863 if i < 0: 864 return None 865 866 return self.tokens[i] 867 868 869 def get(self, i): 870 """ 871 Return absolute token i; ignore which channel the tokens are on; 872 that is, count all tokens not just on-channel tokens. 
873 """ 874 875 return self.tokens[i] 876 877 878 def slice(self, start, stop): 879 if self.p == -1: 880 self.fillBuffer() 881 882 if start < 0 or stop < 0: 883 return None 884 885 return self.tokens[start:stop+1] 886 887 888 def LA(self, i): 889 return self.LT(i).type 890 891 892 def mark(self): 893 self.lastMarker = self.index() 894 return self.lastMarker 895 896 897 def release(self, marker=None): 898 # no resources to release 899 pass 900 901 902 def size(self): 903 return len(self.tokens) 904 905 906 def range(self): 907 return self._range 908 909 910 def index(self): 911 return self.p 912 913 914 def rewind(self, marker=None): 915 if marker is None: 916 marker = self.lastMarker 917 918 self.seek(marker) 919 920 921 def seek(self, index): 922 self.p = index 923 924 925 def getTokenSource(self): 926 return self.tokenSource 927 928 929 def getSourceName(self): 930 return self.tokenSource.getSourceName() 931 932 933 def toString(self, start=None, stop=None): 934 if self.p == -1: 935 self.fillBuffer() 936 937 if start is None: 938 start = 0 939 elif not isinstance(start, int): 940 start = start.index 941 942 if stop is None: 943 stop = len(self.tokens) - 1 944 elif not isinstance(stop, int): 945 stop = stop.index 946 947 if stop >= len(self.tokens): 948 stop = len(self.tokens) - 1 949 950 return ''.join([t.text for t in self.tokens[start:stop+1]]) 951 952 953class RewriteOperation(object): 954 """@brief Internal helper class.""" 955 956 def __init__(self, stream, index, text): 957 self.stream = stream 958 959 # What index into rewrites List are we? 960 self.instructionIndex = None 961 962 # Token buffer index. 963 self.index = index 964 self.text = text 965 966 def execute(self, buf): 967 """Execute the rewrite operation by possibly adding to the buffer. 968 Return the index of the next token to operate on. 969 """ 970 971 return self.index 972 973 def toString(self): 974 opName = self.__class__.__name__ 975 return '<%s@%d:"%s">' % ( 976 opName, self.index, self.text) 977 978 __str__ = toString 979 __repr__ = toString 980 981 982class InsertBeforeOp(RewriteOperation): 983 """@brief Internal helper class.""" 984 985 def execute(self, buf): 986 buf.write(self.text) 987 if self.stream.tokens[self.index].type != EOF: 988 buf.write(self.stream.tokens[self.index].text) 989 return self.index + 1 990 991 992class ReplaceOp(RewriteOperation): 993 """ 994 @brief Internal helper class. 995 996 I'm going to try replacing range from x..y with (y-x)+1 ReplaceOp 997 instructions. 998 """ 999 1000 def __init__(self, stream, first, last, text): 1001 RewriteOperation.__init__(self, stream, first, text) 1002 self.lastIndex = last 1003 1004 1005 def execute(self, buf): 1006 if self.text is not None: 1007 buf.write(self.text) 1008 1009 return self.lastIndex + 1 1010 1011 1012 def toString(self): 1013 if self.text is None: 1014 return '<DeleteOp@%d..%d>' % (self.index, self.lastIndex) 1015 1016 return '<ReplaceOp@%d..%d:"%s">' % ( 1017 self.index, self.lastIndex, self.text) 1018 1019 __str__ = toString 1020 __repr__ = toString 1021 1022 1023class TokenRewriteStream(CommonTokenStream): 1024 """@brief CommonTokenStream that can be modified. 1025 1026 Useful for dumping out the input stream after doing some 1027 augmentation or other manipulations. 1028 1029 You can insert stuff, replace, and delete chunks. Note that the 1030 operations are done lazily--only if you convert the buffer to a 1031 String. This is very efficient because you are not moving data around 1032 all the time. 
    As the buffer of tokens is converted to a string, the toString()
    methods check to see if there is an operation at the current index.
    If so, the operation is done and then normal string rendering continues
    on the buffer. This is like having multiple Turing machine instruction
    streams (programs) operating on a single input tape. :)

    Since the operations are done lazily at toString-time, operations do not
    screw up the token index values. That is, an insert operation at token
    index i does not change the index values for tokens i+1..n-1.

    Because operations never actually alter the buffer, you may always get
    the original token stream back without undoing anything. Since
    the instructions are queued up, you can easily simulate transactions and
    roll back any changes if there is an error just by removing instructions.
    For example:

        input = ANTLRFileStream("input")
        lex = TLexer(input)
        tokens = TokenRewriteStream(lex)
        parser = T(tokens)
        parser.startRule()

    Then in the rules, you can execute (where t and u are Token instances):

        input.insertAfter(t, "text to put after t")
        input.insertAfter(u, "text after u")
        print tokens.toString()

    (Unlike the Java runtime, no cast of 'input' to TokenRewriteStream is
    needed here.)

    You can also have multiple "instruction streams" and get multiple
    rewrites from a single pass over the input. Just name the instruction
    streams and use that name again when printing the buffer. This could be
    useful for generating a C file and also its header file--all from the
    same buffer:

        tokens.insertAfter("pass1", t, "text to put after t")
        tokens.insertAfter("pass2", u, "text after u")
        print tokens.toString("pass1")
        print tokens.toString("pass2")

    If you don't use named rewrite streams, a "default" stream is used, as
    the first example shows.
    """

    DEFAULT_PROGRAM_NAME = "default"
    MIN_TOKEN_INDEX = 0

    def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL):
        CommonTokenStream.__init__(self, tokenSource, channel)

        # You may have multiple, named streams of rewrite operations.
        # I'm calling these things "programs."
        # Maps program name (str) -> rewrite operations (list)
        self.programs = {}
        self.programs[self.DEFAULT_PROGRAM_NAME] = []

        # Maps program name (str) -> last rewrite token index (int)
        self.lastRewriteTokenIndexes = {}


    def rollback(self, *args):
        """
        Rollback the instruction stream for a program so that
        the indicated instruction (via instructionIndex) is no
        longer in the stream. UNTESTED!
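
        For example, rollback(2) keeps instructions 0..1 of the default
        program and discards everything queued after them.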
1099 """ 1100 1101 if len(args) == 2: 1102 programName = args[0] 1103 instructionIndex = args[1] 1104 elif len(args) == 1: 1105 programName = self.DEFAULT_PROGRAM_NAME 1106 instructionIndex = args[0] 1107 else: 1108 raise TypeError("Invalid arguments") 1109 1110 p = self.programs.get(programName, None) 1111 if p is not None: 1112 self.programs[programName] = ( 1113 p[self.MIN_TOKEN_INDEX:instructionIndex]) 1114 1115 1116 def deleteProgram(self, programName=DEFAULT_PROGRAM_NAME): 1117 """Reset the program so that no instructions exist""" 1118 1119 self.rollback(programName, self.MIN_TOKEN_INDEX) 1120 1121 1122 def insertAfter(self, *args): 1123 if len(args) == 2: 1124 programName = self.DEFAULT_PROGRAM_NAME 1125 index = args[0] 1126 text = args[1] 1127 1128 elif len(args) == 3: 1129 programName = args[0] 1130 index = args[1] 1131 text = args[2] 1132 1133 else: 1134 raise TypeError("Invalid arguments") 1135 1136 if isinstance(index, Token): 1137 # index is a Token, grap the stream index from it 1138 index = index.index 1139 1140 # to insert after, just insert before next index (even if past end) 1141 self.insertBefore(programName, index+1, text) 1142 1143 1144 def insertBefore(self, *args): 1145 if len(args) == 2: 1146 programName = self.DEFAULT_PROGRAM_NAME 1147 index = args[0] 1148 text = args[1] 1149 1150 elif len(args) == 3: 1151 programName = args[0] 1152 index = args[1] 1153 text = args[2] 1154 1155 else: 1156 raise TypeError("Invalid arguments") 1157 1158 if isinstance(index, Token): 1159 # index is a Token, grap the stream index from it 1160 index = index.index 1161 1162 op = InsertBeforeOp(self, index, text) 1163 rewrites = self.getProgram(programName) 1164 op.instructionIndex = len(rewrites) 1165 rewrites.append(op) 1166 1167 1168 def replace(self, *args): 1169 if len(args) == 2: 1170 programName = self.DEFAULT_PROGRAM_NAME 1171 first = args[0] 1172 last = args[0] 1173 text = args[1] 1174 1175 elif len(args) == 3: 1176 programName = self.DEFAULT_PROGRAM_NAME 1177 first = args[0] 1178 last = args[1] 1179 text = args[2] 1180 1181 elif len(args) == 4: 1182 programName = args[0] 1183 first = args[1] 1184 last = args[2] 1185 text = args[3] 1186 1187 else: 1188 raise TypeError("Invalid arguments") 1189 1190 if isinstance(first, Token): 1191 # first is a Token, grap the stream index from it 1192 first = first.index 1193 1194 if isinstance(last, Token): 1195 # last is a Token, grap the stream index from it 1196 last = last.index 1197 1198 if first > last or first < 0 or last < 0 or last >= len(self.tokens): 1199 raise ValueError( 1200 "replace: range invalid: %d..%d (size=%d)" 1201 % (first, last, len(self.tokens))) 1202 1203 op = ReplaceOp(self, first, last, text) 1204 rewrites = self.getProgram(programName) 1205 op.instructionIndex = len(rewrites) 1206 rewrites.append(op) 1207 1208 1209 def delete(self, *args): 1210 self.replace(*(list(args) + [None])) 1211 1212 1213 def getLastRewriteTokenIndex(self, programName=DEFAULT_PROGRAM_NAME): 1214 return self.lastRewriteTokenIndexes.get(programName, -1) 1215 1216 1217 def setLastRewriteTokenIndex(self, programName, i): 1218 self.lastRewriteTokenIndexes[programName] = i 1219 1220 1221 def getProgram(self, name): 1222 p = self.programs.get(name, None) 1223 if p is None: 1224 p = self.initializeProgram(name) 1225 1226 return p 1227 1228 1229 def initializeProgram(self, name): 1230 p = [] 1231 self.programs[name] = p 1232 return p 1233 1234 1235 def toOriginalString(self, start=None, end=None): 1236 if self.p == -1: 1237 self.fillBuffer() 1238 1239 
        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            if self.get(i).type != EOF:
                buf.write(self.get(i).text)
            i += 1

        return buf.getvalue()


    def toString(self, *args):
        if self.p == -1:
            self.fillBuffer()

        if len(args) == 0:
            programName = self.DEFAULT_PROGRAM_NAME
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 1:
            programName = args[0]
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            start = args[0]
            end = args[1]

        else:
            raise TypeError("Invalid arguments")

        if start is None:
            start = self.MIN_TOKEN_INDEX
        elif not isinstance(start, int):
            start = start.index

        if end is None:
            end = len(self.tokens) - 1
        elif not isinstance(end, int):
            end = end.index

        # ensure start/end are in range
        if end >= len(self.tokens):
            end = len(self.tokens) - 1

        if start < 0:
            start = 0

        rewrites = self.programs.get(programName)
        if rewrites is None or len(rewrites) == 0:
            # no instructions to execute
            return self.toOriginalString(start, end)

        buf = StringIO()

        # First, optimize instruction stream
        indexToOp = self.reduceToSingleOperationPerIndex(rewrites)

        # Walk buffer, executing instructions and emitting tokens
        i = start
        while i <= end and i < len(self.tokens):
            op = indexToOp.get(i)
            # remove so any left have index size-1
            try:
                del indexToOp[i]
            except KeyError:
                pass

            t = self.tokens[i]
            if op is None:
                # no operation at that index, just dump token
                if t.type != EOF:
                    buf.write(t.text)
                i += 1 # move to next token

            else:
                i = op.execute(buf) # execute operation and skip

        # include stuff after end if it's the last index in the buffer
        # So, if they did an insertAfter(lastValidIndex, "foo"), include
        # foo if end==lastValidIndex.
        if end == len(self.tokens) - 1:
            # Scan any remaining operations after the last token; they
            # should be included (they will be inserts).
            for i in sorted(indexToOp.keys()):
                op = indexToOp[i]
                if op.index >= len(self.tokens)-1:
                    buf.write(op.text)

        return buf.getvalue()

    __str__ = toString


    def reduceToSingleOperationPerIndex(self, rewrites):
        """
        We need to combine operations and report invalid operations (like
        overlapping replaces that are not completely nested). Inserts to
        the same index need to be combined, etc...
        Here are the cases:

            I.i.u I.j.v                          leave alone, nonoverlapping
            I.i.u I.i.v                          combine: Iivu

            R.i-j.u R.x-y.v | i-j in x-y         delete first R
            R.i-j.u R.i-j.v                      delete first R
            R.i-j.u R.x-y.v | x-y in i-j         ERROR
            R.i-j.u R.x-y.v | boundaries overlap ERROR

            Delete special case of replace (text==None):
            D.i-j.u D.x-y.v | boundaries overlap combine to
                                                 min(left)..max(right)

            I.i.u R.x-y.v   | i in (x+1)-y       delete I (since insert
                                                 before, we're not deleting i)
            I.i.u R.x-y.v   | i not in (x+1)-y   leave alone, nonoverlapping

            R.x-y.v I.i.u   | i in x-y           ERROR
            R.x-y.v I.x.u                        R.x-y.uv (combine, delete I)
            R.x-y.v I.i.u   | i not in x-y       leave alone, nonoverlapping

        I.i.u = insert u before op @ index i
        R.x-y.u = replace x-y indexed tokens with u

        First we need to examine replaces. For any replace op:

            1. wipe out any insertions before op within that range.
            2. Drop any replace op before that is contained completely
               within that range.
            3. Throw exception upon boundary overlap with any previous
               replace.

        Then we can deal with inserts:

            1. for any inserts to the same index, combine even if not
               adjacent.
            2. for any prior replace with the same left boundary, combine
               this insert with the replace and delete this replace.
            3. throw exception if the index is in the same range as a
               previous replace.

        Don't actually delete; make the op None in the list. It's easier to
        walk the list that way. Later we can throw as we add to the
        index -> op map.

        Note that I.2 R.2-2 will wipe out I.2 even though, technically, the
        inserted stuff would be before the replace range. But, if you
        add tokens in front of a method body '{' and then delete the method
        body, I think the stuff before the '{' you added should disappear too.

        Return a map from token index to operation.
        """

        # WALK REPLACES
        for i, rop in enumerate(rewrites):
            if rop is None:
                continue

            if not isinstance(rop, ReplaceOp):
                continue

            # Wipe prior inserts within range
            for j, iop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if iop.index == rop.index:
                    # E.g., insert before 2, delete 2..2; update replace
                    # text to include insert before, kill insert
                    rewrites[iop.instructionIndex] = None
                    rop.text = self.catOpText(iop.text, rop.text)

                elif iop.index > rop.index and iop.index <= rop.lastIndex:
                    # delete insert as it's a no-op.
                    rewrites[j] = None

            # Drop any prior replaces contained within
            for j, prevRop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if (prevRop.index >= rop.index
                    and prevRop.lastIndex <= rop.lastIndex):
                    # delete replace as it's a no-op.
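                    # (it is fully contained in the current replace, so
                    # no overlap check is needed for this pair)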
                    rewrites[j] = None
                    continue

                # throw exception unless disjoint or identical
                disjoint = (prevRop.lastIndex < rop.index
                            or prevRop.index > rop.lastIndex)
                same = (prevRop.index == rop.index
                        and prevRop.lastIndex == rop.lastIndex)

                # Delete special case of replace (text==None):
                # D.i-j.u D.x-y.v | boundaries overlap => combine to
                # min(left)..max(right)
                if prevRop.text is None and rop.text is None and not disjoint:
                    # kill first delete
                    rewrites[prevRop.instructionIndex] = None

                    rop.index = min(prevRop.index, rop.index)
                    rop.lastIndex = max(prevRop.lastIndex, rop.lastIndex)

                elif not disjoint and not same:
                    raise ValueError(
                        "replace op boundaries of %s overlap with previous %s"
                        % (rop, prevRop))

        # WALK INSERTS
        for i, iop in enumerate(rewrites):
            if iop is None:
                continue

            if not isinstance(iop, InsertBeforeOp):
                continue

            # combine current insert with prior if any at same index
            for j, prevIop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if prevIop.index == iop.index: # combine objects
                    # convert to strings...we're in the process of
                    # toString'ing the whole token buffer, so there's no
                    # lazy eval issue with any templates
                    iop.text = self.catOpText(iop.text, prevIop.text)
                    # delete redundant prior insert
                    rewrites[j] = None

            # look for replaces where iop.index is in range; error
            for j, rop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if iop.index == rop.index:
                    rop.text = self.catOpText(iop.text, rop.text)
                    # delete current insert
                    rewrites[i] = None
                    continue

                if iop.index >= rop.index and iop.index <= rop.lastIndex:
                    raise ValueError(
                        "insert op %s within boundaries of previous %s"
                        % (iop, rop))

        m = {}
        for i, op in enumerate(rewrites):
            if op is None:
                # ignore deleted ops
                continue

            assert op.index not in m, "should only be one op per index"
            m[op.index] = op

        return m


    def catOpText(self, a, b):
        x = ""
        y = ""
        if a is not None:
            x = a
        if b is not None:
            y = b
        return x + y


    def getKindOfOps(self, rewrites, kind, before=None):
        """Get all operations of a particular kind before the given index."""

        if before is None:
            before = len(rewrites)
        elif before > len(rewrites):
            before = len(rewrites)

        for i, op in enumerate(rewrites[:before]):
            if op is None:
                # ignore deleted
                continue
            if op.__class__ == kind:
                yield i, op


    def toDebugString(self, start=None, end=None):
        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            # tokens render via their __str__/toString for debugging
            buf.write(str(self.get(i)))
            i += 1

        return buf.getvalue()
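

# A small end-to-end sketch of the rewrite machinery (illustrative only:
# DemoSource and the token values are made up, and CommonToken is assumed
# to accept the type/text keyword arguments defined in antlr3.tokens):
if __name__ == '__main__':
    class DemoSource(object):
        """Minimal TokenSource: yields three tokens, then EOF."""

        def __init__(self):
            self._tokens = [
                CommonToken(type=1, text=u'x'),
                CommonToken(type=2, text=u'='),
                CommonToken(type=3, text=u'1'),
                CommonToken(type=EOF),
                ]

        def nextToken(self):
            return self._tokens.pop(0)

    tokens = TokenRewriteStream(DemoSource())
    tokens.fillBuffer()               # replace() validates against len(tokens)
    tokens.insertBefore(0, u'int ')
    tokens.replace(2, u'42')          # replace the token at index 2
    print tokens.toString()           # -> int x=42
    print tokens.toOriginalString()   # -> x=1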