rhino-python.extensions revision 324c4644fee44b9898524c09511bd33c3f12e2df
1/*
2 [The "BSD licence"]
3 Copyright (c) 2004 Terence Parr and Loring Craymer
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions
8 are met:
9 1. Redistributions of source code must retain the above copyright
10    notice, this list of conditions and the following disclaimer.
11 2. Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 3. The name of the author may not be used to endorse or promote products
15    derived from this software without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*/
28
29
30/** Python does not explicitly provide begin and end nesting signals.
31 Rather, the indentation level indicates when you begin and end.
32 This is an interesting lexical problem because multiple DEDENT
33 tokens should be sent to the parser sometimes without a corresponding
34 input symbol!  Consider the following example:
35
36 a=1
37 if a>1:
38     print a
39 b=3
40
41 Here the "b" token on the left edge signals that a DEDENT is needed
42 after the "print a \n" and before the "b".  The sequence should be
43
44 ... 1 COLON NEWLINE INDENT PRINT a NEWLINE DEDENT b ASSIGN 3 ...
45
46 For more examples, see the big comment at the bottom of this file.
47
48 This TokenStream normally just passes tokens through to the parser.
49 Upon NEWLINE token from the lexer, however, an INDENT or DEDENT token
50 may need to be sent to the parser.  The NEWLINE is the trigger for
51 this class to do it's job.  NEWLINE is saved and then the first token
52 of the next line is examined.  If non-leading-whitespace token,
53 then check against stack for indent vs dedent.  If LEADING_WS, then
54 the column of the next non-whitespace token will dictate indent vs
55 dedent.  The column of the next real token is number of spaces
56 in the LEADING_WS token + 1 (to move past the whitespace).  The
57 lexer grammar must set the text of the LEADING_WS token to be
58 the proper number of spaces (and do tab conversion etc...).
59
60 A stack of column numbers is tracked and used to detect changes
61 in indent level from one token to the next.
62
63 A queue of tokens is built up to hold multiple DEDENT tokens that
64 are generated.  Before asking the lexer for another token via
65 nextToken(), the queue is flushed first one token at a time.
66
67 Terence Parr and Loring Craymer
68 February 2004
69 */
70PythonTokenSource = function(stream) {
71    this.stream = stream;
72	/** The stack of indent levels (column numbers) */
73	this.indentStack = new Array(PythonTokenSource.MAX_INDENTS);
74	/** stack pointer */
75	this.sp=-1; // grow upwards
76
77	/** The queue of tokens */
78	this.tokens = [];
79	this.lastTokenAddedIndex = -1;
80	this.push(PythonTokenSource.FIRST_CHAR_POSITION);
81};
82
83ANTLR.lang.augmentObject(PythonTokenSource, {
84	MAX_INDENTS: 100,
85	FIRST_CHAR_POSITION: 0,
86});
87
88PythonTokenSource.prototype = {
89	getSourceName: function() {
90		return this.stream.getSourceName();
91	},
92
93	/** From http://www.python.org/doc/2.2.3/ref/indentation.html
94
95	 "Before the first line of the file is read, a single zero is
96	 pushed on the stack; this will never be popped off again. The
97	 numbers pushed on the stack will always be strictly increasing
98	 from bottom to top. At the beginning of each logical line, the
99	 line's indentation level is compared to the top of the
100	 stack. If it is equal, nothing happens. If it is larger, it is
101	 pushed on the stack, and one INDENT token is generated. If it
102	 is smaller, it must be one of the numbers occurring on the
103	 stack; all numbers on the stack that are larger are popped
104	 off, and for each number popped off a DEDENT token is
105	 generated. At the end of the file, a DEDENT token is generated
106	 for each number remaining on the stack that is larger than
107	 zero."
108
109	 I use char position in line 0..n-1 instead.
110
111	 The DEDENTS possibly needed at EOF are gracefully handled by forcing
112	 EOF to have char pos 0 even though with UNIX it's hard to get EOF
113	 at a non left edge.
114	 */
115	nextToken: function() {
116		// if something in queue, just remove and return it
117		if (this.tokens.length>0 ) {
118			var t = this.tokens[0];
119			this.tokens.splice(0,1);
120			return t;
121		}
122
123		this.insertImaginaryIndentDedentTokens();
124
125		return this.nextToken();
126	},
127
128	insertImaginaryIndentDedentTokens: function()
129	{
130		var t = this.stream.LT(1);
131		this.stream.consume();
132
133		// if not a NEWLINE, doesn't signal indent/dedent work; just enqueue
134		if ( t.getType()!=PythonLexer.NEWLINE ) {
135			var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
136			if ( hiddenTokens!=null ) {
137				this.tokens = this.tokens.concat(hiddenTokens);
138			}
139			this.lastTokenAddedIndex = t.getTokenIndex();
140			this.tokens.push(t);
141			return;
142		}
143
144		// save NEWLINE in the queue
145		var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
146		if ( hiddenTokens!=null ) {
147			this.tokens = this.tokens.concat(hiddenTokens);
148		}
149		this.lastTokenAddedIndex = t.getTokenIndex();
150		this.tokens.push(t);
151
152		// grab first token of next line
153		t = this.stream.LT(1);
154		this.stream.consume();
155
156		hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
157		if ( hiddenTokens!=null ) {
158			this.tokens = this.tokens.concat(hiddenTokens);
159		}
160		this.lastTokenAddedIndex = t.getTokenIndex();
161
162		// compute cpos as the char pos of next non-WS token in line
163		var cpos = t.getCharPositionInLine(); // column dictates indent/dedent
164		if ( t.getType()==ANTLR.runtime.Token.EOF ) {
165			cpos = -1; // pretend EOF always happens at left edge
166		}
167		else if ( t.getType()==PythonLexer.LEADING_WS ) {
168			cpos = t.getText().length;
169		}
170
171		// compare to last indent level
172		var lastIndent = this.peek();
173		if ( cpos > lastIndent ) { // they indented; track and gen INDENT
174			this.push(cpos);
175			var indent = new ANTLR.runtime.CommonToken(PythonParser.INDENT, "");
176			indent.setCharPositionInLine(t.getCharPositionInLine());
177			indent.setLine(t.getLine());
178			this.tokens.push(indent);
179		}
180		else if ( cpos < lastIndent ) { // they dedented
181			// how far back did we dedent?
182			var prevIndex = this.findPreviousIndent(cpos);
183			// generate DEDENTs for each indent level we backed up over
184			for (var d=this.sp-1; d>=prevIndex; d--) {
185				var dedent = new ANTLR.runtime.CommonToken(PythonParser.DEDENT, "");
186				dedent.setCharPositionInLine(t.getCharPositionInLine());
187				dedent.setLine(t.getLine());
188				this.tokens.push(dedent);
189			}
190			this.sp = prevIndex; // pop those off indent level
191		}
192		if ( t.getType()!=PythonLexer.LEADING_WS ) { // discard WS
193			this.tokens.push(t);
194		}
195	},
196
197	//  T O K E N  S T A C K  M E T H O D S
198
199	push: function(i) {
200		if (this.sp>=PythonTokenSource.MAX_INDENTS) {
201			throw new Error("stack overflow");
202		}
203		this.sp++;
204		this.indentStack[this.sp] = i;
205	},
206
207	pop: function() {
208		if (this.sp<0) {
209			throw new Error("stack underflow");
210		}
211		var top = this.indentStack[this.sp];
212		this.sp--;
213		return top;
214	},
215
216	peek: function() {
217		return this.indentStack[this.sp];
218	},
219
220	/** Return the index on stack of previous indent level == i else -1 */
221	findPreviousIndent: function(i) {
222		for (var j=this.sp-1; j>=0; j--) {
223			if (this.indentStack[j]==i ) {
224				return j;
225			}
226		}
227		return PythonTokenSource.FIRST_CHAR_POSITION;
228	},
229
230	stackString: function() {
231		var buf = [];
232		for (var j=this.sp; j>=0; j--) {
233			buf.push(this.indentStack[j]);
234		}
235		return buf.join(" ");
236	}
237
238}
239
240/* More example input / output pairs with code simplified to single chars
241------- t1 -------
242a a
243        b b
244        c
245d
246a a \n INDENT b b \n c \n DEDENT d \n EOF
247------- t2 -------
248a  c
249 b
250c
251a c \n INDENT b \n DEDENT c \n EOF
252------- t3 -------
253a
254        b
255                c
256d
257a \n INDENT b \n INDENT c \n DEDENT DEDENT d \n EOF
258------- t4 -------
259a
260    c
261                  d
262    e
263    f
264             g
265             h
266             i
267              j
268    k
269a \n INDENT c \n INDENT d \n DEDENT e \n f \n INDENT g \n h \n i \n INDENT j \n DEDENT DEDENT k \n DEDENT EOF
270------- t5 -------
271a
272        b
273        c
274                d
275                e
276a \n INDENT b \n c \n INDENT d \n e \n DEDENT DEDENT EOF
277*/
278