/*
 [The 'BSD licence']
 Copyright (c) 2004 Terence Parr and Loring Craymer
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the author may not be used to endorse or promote products
    derived from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** Python 2.3.3 Grammar
 *
 *  Terence Parr and Loring Craymer
 *  February 2004
 *
 *  Converted to ANTLR v3 November 2005 by Terence Parr.
 *
 *  This grammar was derived automatically from the Python 2.3.3
 *  parser grammar to get a syntactically correct ANTLR grammar
 *  for Python.  Then Terence hand tweaked it to be semantically
 *  correct; i.e., removed lookahead issues etc...  It is LL(1)
 *  except for the (sometimes optional) trailing commas and semi-colons.
 *  It needs two symbols of lookahead in this case.
 *
 *  Starting with Loring's preliminary lexer for Python, I modified it
 *  to do my version of the whole nasty INDENT/DEDENT issue just so I
 *  could understand the problem better.  This grammar requires
 *  PythonTokenStream.java to work.  Also I used some rules from the
 *  semi-formal grammar on the web for Python (automatically
 *  translated to ANTLR format by an ANTLR grammar, naturally <grin>).
 *  The lexical rules for python are particularly nasty and it took me
 *  a long time to get it 'right'; i.e., think about it in the proper
 *  way.  Resist changing the lexer unless you've used ANTLR a lot. ;)
 *
 *  I (Terence) tested this by running it on the jython-2.1/Lib
 *  directory of 40k lines of Python.
 *
 *  REQUIRES ANTLR v3
 */
grammar Python;

// Generate the recognizer for the ANTLR v3 JavaScript target.
options {language=JavaScript;}

// Token types that have no lexer rule in this grammar; they are declared
// here so the parser rule "suite" can reference them.
tokens {
    INDENT;
    DEDENT;
}

@lexer::members {
/** Handles context-sensitive lexing of implicit line joining such as
 *  the case where newline is ignored in cases like this:
 *  a = [3,
 *       4]
 *  The counter is incremented by the opening bracket tokens and
 *  decremented by the closing bracket tokens.
 */
    this.implicitLineJoiningLevel= 0;
    // Tested against 0 by the NEWLINE, WS, LEADING_WS and COMMENT rules.
    // Nothing in this grammar assigns it after this initial -1, so it is
    // presumably maintained by an externally supplied nextToken -- TODO confirm.
    this.startPos = -1;
}

/** Entry rule for a single interactive statement: a bare NEWLINE, a simple
 *  statement, or a compound statement followed by NEWLINE.
 */
single_input
    :   NEWLINE
    |   simple_stmt
    |   compound_stmt NEWLINE
    ;

/** Entry rule for a whole file: any sequence of NEWLINE tokens and statements. */
file_input
    :   ( NEWLINE | stmt )*
    ;

/** Entry rule for a single expression list, with optional NEWLINE tokens
 *  before and after it.
 */
eval_input
    :   (NEWLINE)* testlist (NEWLINE)*
    ;

/** Function definition: 'def' name, parameter list, colon and body.
 *  After the body has been matched the action reports the function name
 *  through xlog, which is not defined in this grammar and has to be
 *  supplied by the code that embeds the generated parser.
 */
funcdef
    :   'def' NAME parameters COLON suite
        {xlog("found method def "+$NAME.text);}
    ;

/** Parenthesized formal parameter list of a function; may be empty. */
parameters
    :   LPAREN (varargslist)? RPAREN
    ;

/** Formal parameters: ordinary (optionally defaulted) parameters, optionally
 *  followed by a trailing comma and/or the *name and **name collectors.
 *  The collectors may also appear on their own.
 */
varargslist
    :   defparameter (options {greedy=true;}:COMMA defparameter)*
        (   COMMA
            (   STAR NAME (COMMA DOUBLESTAR NAME)?
            |   DOUBLESTAR NAME
            )?
        )?
    |   STAR NAME (COMMA DOUBLESTAR NAME)?
    |   DOUBLESTAR NAME
    ;

/** One formal parameter with an optional default value. */
defparameter
    :   fpdef (ASSIGN test)?
    ;

/** Formal parameter target: a plain name or a parenthesized, possibly
 *  nested, list of targets.
 */
fpdef
    :   NAME
    |   LPAREN fplist RPAREN
    ;

/** Comma-separated parameter targets with an optional trailing comma. */
fplist
    :   fpdef (options {greedy=true;}:COMMA fpdef)* (COMMA)?
    ;


/** Any statement: a one-line simple statement or a compound statement. */
stmt
    :   simple_stmt
    |   compound_stmt
    ;

/** One logical line: small statements separated by semicolons, an optional
 *  trailing semicolon, then NEWLINE.
 */
simple_stmt
    :   small_stmt (options {greedy=true;}:SEMI small_stmt)* (SEMI)? NEWLINE
    ;

/** The statement kinds that may share a line inside simple_stmt. */
small_stmt
    :   expr_stmt
    |   print_stmt
    |   del_stmt
    |   pass_stmt
    |   flow_stmt
    |   import_stmt
    |   global_stmt
    |   exec_stmt
    |   assert_stmt
    ;

/** Expression statement: a bare expression list, an augmented assignment,
 *  or a chain of one or more plain assignments.
 */
expr_stmt
    :   testlist
        (   augassign testlist
        |   (ASSIGN testlist)+
        )?
    ;

/** Augmented assignment operators. */
augassign
    :   PLUSEQUAL
    |   MINUSEQUAL
    |   STAREQUAL
    |   SLASHEQUAL
    |   PERCENTEQUAL
    |   AMPEREQUAL
    |   VBAREQUAL
    |   CIRCUMFLEXEQUAL
    |   LEFTSHIFTEQUAL
    |   RIGHTSHIFTEQUAL
    |   DOUBLESTAREQUAL
    |   DOUBLESLASHEQUAL
    ;

/** Print statement: 'print' alone, with an expression list, or with
 *  RIGHTSHIFT followed by an expression list.
 */
print_stmt
    :   'print'
        (   testlist
        |   RIGHTSHIFT testlist
        )?
    ;

/** Delete statement. */
del_stmt
    :   'del' exprlist
    ;

/** No-op statement. */
pass_stmt
    :   'pass'
    ;

/** Statements that transfer control. */
flow_stmt
    :   break_stmt
    |   continue_stmt
    |   return_stmt
    |   raise_stmt
    |   yield_stmt
    ;

break_stmt
    :   'break'
    ;

continue_stmt
    :   'continue'
    ;

/** Return with an optional expression list. */
return_stmt
    :   'return' (testlist)?
    ;

/** Yield; the expression list is mandatory in this grammar. */
yield_stmt
    :   'yield' testlist
    ;

/** Raise with zero to three comma-separated expressions. */
raise_stmt
    :   'raise' (test (COMMA test (COMMA test)?)?)?
    ;

/** Import statement: 'import a.b, c ...' or
 *  'from a.b import *' / 'from a.b import x, y ...'.
 */
import_stmt
    :   'import' dotted_as_name (COMMA dotted_as_name)*
    |   'from' dotted_name 'import'
        (   STAR
        |   import_as_name (COMMA import_as_name)*
        )
    ;

/** Imported name with an optional two-NAME suffix.  The suffix is matched
 *  as plain NAME NAME, so the grammar does not check that the first of the
 *  two is the word "as".
 */
import_as_name
    :   NAME (NAME NAME)?
    ;

/** Dotted module name with the same optional two-NAME suffix. */
dotted_as_name
    :   dotted_name (NAME NAME)?
    ;

/** One or more names separated by dots. */
dotted_name
    :   NAME (DOT NAME)*
    ;

/** Global declaration for one or more names. */
global_stmt
    :   'global' NAME (COMMA NAME)*
    ;

/** Exec statement with an optional 'in' clause of one or two expressions. */
exec_stmt
    :   'exec' expr ('in' test (COMMA test)?)?
    ;

/** Assert statement with an optional second expression. */
assert_stmt
    :   'assert' test (COMMA test)?
    ;


/** Statements that own a nested suite. */
compound_stmt
    :   if_stmt
    |   while_stmt
    |   for_stmt
    |   try_stmt
    |   funcdef
    |   classdef
    ;

/** If statement with any number of 'elif' branches and an optional 'else'. */
if_stmt
    :   'if' test COLON suite
        ('elif' test COLON suite)*
        ('else' COLON suite)?
    ;

/** While loop with an optional 'else' suite. */
while_stmt
    :   'while' test COLON suite ('else' COLON suite)?
    ;

/** For loop with an optional 'else' suite. */
for_stmt
    :   'for' exprlist 'in' testlist COLON suite ('else' COLON suite)?
    ;

/** Try statement.  The two forms are mutually exclusive here: either one or
 *  more except clauses with an optional 'else', or a single 'finally'.
 */
try_stmt
    :   'try' COLON suite
        (   (except_clause COLON suite)+ ('else' COLON suite)?
        |   'finally' COLON suite
        )
    ;

/** Except header with zero, one or two expressions. */
except_clause
    :   'except' (test (COMMA test)?)?
    ;

/** Body of a compound statement: a simple statement on the same line, or
 *  NEWLINE followed by one or more statements between INDENT and DEDENT.
 */
suite
    :   simple_stmt
    |   NEWLINE INDENT (stmt)+ DEDENT
    ;


/** Top of the expression hierarchy: an 'or' chain or a lambda. */
test
    :   and_test ('or' and_test)*
    |   lambdef
    ;

/** Chain of 'and' operands. */
and_test
    :   not_test ('and' not_test)*
    ;

/** Optional (repeatable) 'not' prefix in front of a comparison. */
not_test
    :   'not' not_test
    |   comparison
    ;

/** Chain of expressions joined by comparison operators. */
comparison
    :   expr (comp_op expr)*
    ;

/** Comparison operators, including the two-word forms 'not in' and 'is not'. */
comp_op
    :   LESS
    |   GREATER
    |   EQUAL
    |   GREATEREQUAL
    |   LESSEQUAL
    |   ALT_NOTEQUAL
    |   NOTEQUAL
    |   'in'
    |   'not' 'in'
    |   'is'
    |   'is' 'not'
    ;

// Binary operator ladder.  Each rule matches a chain of the rule below it,
// so rules further down bind more tightly.

/** Chain joined by VBAR. */
expr
    :   xor_expr (VBAR xor_expr)*
    ;

/** Chain joined by CIRCUMFLEX. */
xor_expr
    :   and_expr (CIRCUMFLEX and_expr)*
    ;

/** Chain joined by AMPER. */
and_expr
    :   shift_expr (AMPER shift_expr)*
    ;

/** Chain joined by LEFTSHIFT or RIGHTSHIFT. */
shift_expr
    :   arith_expr ((LEFTSHIFT|RIGHTSHIFT) arith_expr)*
    ;

/** Chain joined by PLUS or MINUS. */
arith_expr
    :   term ((PLUS|MINUS) term)*
    ;

/** Chain joined by STAR, SLASH, PERCENT or DOUBLESLASH. */
term
    :   factor ((STAR | SLASH | PERCENT | DOUBLESLASH ) factor)*
    ;

/** Unary PLUS, MINUS or TILDE applied to a factor, or a power expression. */
factor
    :   (PLUS|MINUS|TILDE) factor
    |   power
    ;

/** Atom with any number of trailers and an optional DOUBLESTAR exponent.
 *  The exponent is a factor, so it may carry its own unary operator.
 */
power
    :   atom (trailer)* (options {greedy=true;}:DOUBLESTAR factor)?
    ;

/** Primary expressions: parenthesized expression list, list display,
 *  dictionary display, backquoted expression list, name, numeric literal,
 *  or one or more adjacent string literals.
 */
atom
    :   LPAREN (testlist)? RPAREN
    |   LBRACK (listmaker)? RBRACK
    |   LCURLY (dictmaker)? RCURLY
    |   BACKQUOTE testlist BACKQUOTE
    |   NAME
    |   INT
    |   LONGINT
    |   FLOAT
    |   COMPLEX
    |   (STRING)+
    ;

/** Contents of a list display: a first element followed either by a
 *  list comprehension clause or by further comma-separated elements,
 *  with an optional trailing comma.
 */
listmaker
    :   test ( list_for | (options {greedy=true;}:COMMA test)* ) (COMMA)?
    ;

/** Lambda expression with an optional parameter list. */
lambdef
    :   'lambda' (varargslist)? COLON test
    ;

/** Suffix applied to an atom: call, subscription/slicing, or attribute access. */
trailer
    :   LPAREN (arglist)? RPAREN
    |   LBRACK subscriptlist RBRACK
    |   DOT NAME
    ;

/** Comma-separated subscripts with an optional trailing comma. */
subscriptlist
    :   subscript (options {greedy=true;}:COMMA subscript)* (COMMA)?
    ;

/** One subscript: an ellipsis (three DOT tokens), a plain index, or a slice
 *  whose lower bound, upper bound and stride part are each optional.
 */
subscript
    :   DOT DOT DOT
    |   test (COLON (test)? (sliceop)?)?
    |   COLON (test)? (sliceop)?
    ;

/** Stride part of a slice: a second colon with an optional expression. */
sliceop
    :   COLON (test)?
    ;

// The three list rules below allow an optional trailing comma, so the loop
// is given two symbols of lookahead (k=2) to tell a separating comma from
// the trailing one.

/** Comma-separated expr items. */
exprlist
    :   expr (options {k=2;}:COMMA expr)* (COMMA)?
    ;

/** Comma-separated test items. */
testlist
    :   test (options {k=2;}: COMMA test)* (COMMA)?
    ;

/** Comma-separated key COLON value pairs of a dictionary display. */
dictmaker
    :   test COLON test
        (options {k=2;}:COMMA test COLON test)* (COMMA)?
    ;

/** Class definition with an optional parenthesized base list.
 *  After the body has been matched the action reports the class name
 *  through xlog, which is not defined in this grammar and has to be
 *  supplied by the code that embeds the generated parser.
 */
classdef
    :   'class' NAME (LPAREN testlist RPAREN)? COLON suite
        {xlog("found class def "+$NAME.text);}
    ;

/** Call arguments: ordinary arguments, optionally followed by a trailing
 *  comma and/or the STAR and DOUBLESTAR expansions; the expansions may
 *  also appear on their own.
 */
arglist
    :   argument (COMMA argument)*
        (   COMMA
            (   STAR test (COMMA DOUBLESTAR test)?
            |   DOUBLESTAR test
            )?
        )?
    |   STAR test (COMMA DOUBLESTAR test)?
    |   DOUBLESTAR test
    ;

/** One argument: an expression, optionally followed by ASSIGN and a value. */
argument
    :   test (ASSIGN test)?
    ;

/** Continuation of a list comprehension: another 'for' or an 'if' clause. */
list_iter
    :   list_for
    |   list_if
    ;

/** 'for ... in ...' clause, optionally followed by more clauses. */
list_for
    :   'for' exprlist 'in' testlist (list_iter)?
    ;

/** 'if' filter clause, optionally followed by more clauses. */
list_if
    :   'if' test (list_iter)?
    ;

// Single-character tokens.  The three opening brackets increment
// implicitLineJoiningLevel and the closing ones decrement it; NEWLINE and
// LEADING_WS test that counter to hide line breaks and indentation that
// occur inside brackets.

LPAREN      : '(' {this.implicitLineJoiningLevel++;} ;

RPAREN      : ')' {this.implicitLineJoiningLevel--;} ;

LBRACK      : '[' {this.implicitLineJoiningLevel++;} ;

RBRACK      : ']' {this.implicitLineJoiningLevel--;} ;

COLON       : ':' ;

COMMA       : ',' ;

SEMI        : ';' ;

PLUS        : '+' ;

MINUS       : '-' ;

STAR        : '*' ;

SLASH       : '/' ;

VBAR        : '|' ;

AMPER       : '&' ;

LESS        : '<' ;

GREATER     : '>' ;

ASSIGN      : '=' ;

PERCENT     : '%' ;

BACKQUOTE   : '`' ;

LCURLY      : '{' {this.implicitLineJoiningLevel++;} ;

RCURLY      : '}' {this.implicitLineJoiningLevel--;} ;

CIRCUMFLEX  : '^' ;

TILDE       : '~' ;

// Multi-character operators: comparisons, shifts, power, floor division
// and the augmented-assignment forms, followed by DOT.

EQUAL               : '==' ;

NOTEQUAL            : '!=' ;

ALT_NOTEQUAL        : '<>' ;

LESSEQUAL           : '<=' ;

LEFTSHIFT           : '<<' ;

GREATEREQUAL        : '>=' ;

RIGHTSHIFT          : '>>' ;

PLUSEQUAL           : '+=' ;

MINUSEQUAL          : '-=' ;

DOUBLESTAR          : '**' ;

STAREQUAL           : '*=' ;

DOUBLESLASH         : '//' ;

SLASHEQUAL          : '/=' ;

VBAREQUAL           : '|=' ;

PERCENTEQUAL        : '%=' ;

AMPEREQUAL          : '&=' ;

CIRCUMFLEXEQUAL     : '^=' ;

LEFTSHIFTEQUAL      : '<<=' ;

RIGHTSHIFTEQUAL     : '>>=' ;

DOUBLESTAREQUAL     : '**=' ;

DOUBLESLASHEQUAL    : '//=' ;

DOT : '.' ;

/** Floating point literal.  Accepted shapes: .5  .5e3  1.  1.5  1.5e-3
 *  1e3  and 1.e3 .  The exponent is allowed directly after a trailing dot
 *  because Python accepts an exponent after any point-float, including
 *  the digits-dot form.
 */
FLOAT
    :   '.' DIGITS (Exponent)?
    |   DIGITS ( '.' (DIGITS)? (Exponent)? | Exponent )
    ;

/** Long integer: any INT followed by l or L. */
LONGINT
    :   INT ('l'|'L')
    ;

/** Exponent part of a float: e or E, optional sign, digits. */
fragment
Exponent
    :   ('e' | 'E') ( '+' | '-' )? DIGITS
    ;

/** Integer literal.  The hex alternative itself accepts an optional l/L
 *  suffix.  The alternative labelled octal is a 0 followed by any decimal
 *  digits, so it also matches a lone 0 and does not reject 8 or 9.
 */
INT :   // Hex
        '0' ('x' | 'X') ( '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' )+
        ('l' | 'L')?
    |   // Octal
        '0' DIGITS*
    |   '1'..'9' DIGITS*
    ;

/** Imaginary literal: an INT or FLOAT followed by j or J. */
COMPLEX
    :   INT ('j'|'J')
    |   FLOAT ('j'|'J')
    ;

/** One or more decimal digits. */
fragment
DIGITS : ( '0' .. '9' )+ ;

/** Identifier: an ASCII letter or underscore followed by any number of
 *  ASCII letters, digits or underscores.
 */
NAME
    :   ( 'a' .. 'z' | 'A' .. 'Z' | '_' )
        ( 'a' .. 'z' | 'A' .. 'Z' | '_' | '0' .. '9' )*
    ;

/** Match various string types.  Note that greedy=false implies '''
 *  should make us exit loop not continue.
 *
 *  The optional prefix accepts every case variant that Python allows
 *  (r, u, ur, R, U, UR, Ur, uR).  The single-line forms stop at an
 *  unescaped newline; the triple-quoted forms may span lines.
 */
STRING
    :   ('r'|'u'|'ur'|'R'|'U'|'UR'|'Ur'|'uR')?
        (   '\'\'\'' (options {greedy=false;}:.)* '\'\'\''
        |   '"""' (options {greedy=false;}:.)* '"""'
        |   '"' (ESC|~('\\'|'\n'|'"'))* '"'
        |   '\'' (ESC|~('\\'|'\n'|'\''))* '\''
        )
    ;

/** Backslash followed by any single character. */
fragment
ESC
    :   '\\' .
    ;

/** Explicit line joining: consume a backslash, the newline after it
 *  (with an optional carriage return) and any whitespace at the start of
 *  the next line, and put the result on the hidden channel.
 */
CONTINUED_LINE
    :   '\\' ('\r')? '\n' (' '|'\t')* { $channel=HIDDEN; }
    ;

/** Treat a sequence of blank lines as a single blank line.  If
 *  nested within a (..), {..}, or [..], then ignore newlines.
 *  If the first newline starts in column one, they are to be ignored.
 *  "Ignore" means the token is still produced but on the hidden channel.
 */
NEWLINE
    :   ( ('\r')? '\n' )+
        {
        if ( this.startPos==0 || this.implicitLineJoiningLevel>0 )
            $channel=HIDDEN;
        }
    ;

/** Spaces and tabs that do not start at column zero; always hidden.
 *  The gated predicate leaves whitespace at startPos 0 to LEADING_WS.
 */
WS
    :   {this.startPos>0}?=> (' '|'\t')+ {$channel=HIDDEN;}
    ;

/** Grab everything before a real symbol.  Then if newline, kill it
 *  as this is a blank line.  If whitespace followed by comment, kill it
 *  as it's a comment on a line by itself.
 *
 *  Ignore leading whitespace when nested in [..], (..), {..}.
 *
 *  Outside brackets the rule measures the indentation (a space counts 1,
 *  a tab advances to the next multiple of 8) and emits a LEADING_WS token
 *  whose text is that many spaces.
 */
LEADING_WS
@init {
    var spaces = 0;
}
    :   {this.startPos==0}?=>
        (   {this.implicitLineJoiningLevel>0}? ( ' ' | '\t' )+ {$channel=HIDDEN;}
        |   (   ' '  { spaces++; }
            |   '\t' { spaces += 8; spaces -= (spaces \% 8); }
            )+
            {
            // Text of the emitted token: a string of n spaces, n being the
            // indentation width computed above (joining n+1 empty slots
            // with a space yields exactly n spaces).
            var s = new Array(spaces + 1).join(" ");
            this.emit(new org.antlr.runtime.CommonToken(this.LEADING_WS,s));
            }
            // kill trailing newline if present and then ignore
            ( ('\r')? '\n' {if (this.state.token!=null) this.state.token.setChannel(HIDDEN); else $channel=HIDDEN;})*
           // {this.token.setChannel(99); }
        )
    ;

/** Comments not on line by themselves are turned into newlines.

    b = a # end of line comment

    or

    a = [1, # weird
         2]

    This rule is invoked directly by nextToken when the comment is in
    first column or when comment is on end of nonwhitespace line.

    Only match \n here if we didn't start on left edge; let NEWLINE return that.
    Kill the newlines if the comment lives on a line by itself.

    Consume any leading whitespace if it starts on left edge.

    Note that the first alternative requires at least one \n after the
    comment text.
 */
COMMENT
@init {
    $channel=HIDDEN;
}
    :   {this.startPos==0}?=> (' '|'\t')* '#' (~'\n')* '\n'+
    |   {this.startPos>0}?=> '#' (~'\n')* // let NEWLINE handle \n unless char pos==0 for '#'
    ;
602