1"""ANTLR3 runtime package"""
2
3# begin[licence]
4#
5# [The "BSD licence"]
6# Copyright (c) 2005-2008 Terence Parr
7# All rights reserved.
8#
9# Redistribution and use in source and binary forms, with or without
10# modification, are permitted provided that the following conditions
11# are met:
12# 1. Redistributions of source code must retain the above copyright
13#    notice, this list of conditions and the following disclaimer.
14# 2. Redistributions in binary form must reproduce the above copyright
15#    notice, this list of conditions and the following disclaimer in the
16#    documentation and/or other materials provided with the distribution.
17# 3. The name of the author may not be used to endorse or promote products
18#    derived from this software without specific prior written permission.
19#
20# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30#
31# end[licence]
32
33import sys
34import inspect
35
36from antlr3 import compatible_api_versions
37from antlr3.constants import DEFAULT_CHANNEL, HIDDEN_CHANNEL, EOF, \
38     EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE
39from antlr3.exceptions import RecognitionException, MismatchedTokenException, \
40     MismatchedRangeException, MismatchedTreeNodeException, \
41     NoViableAltException, EarlyExitException, MismatchedSetException, \
42     MismatchedNotSetException, FailedPredicateException, \
43     BacktrackingFailed, UnwantedTokenException, MissingTokenException
44from antlr3.tokens import CommonToken, SKIP_TOKEN
45from antlr3.compat import set, frozenset, reversed
46
47
class RecognizerSharedState(object):
    """
    Mutable state shared by every recognizer taking part in a parse.

    An abstract recognizer needs this bundle of fields to recognize input
    and recover from errors.  Keeping them in a separate object lets
    several grammars share one state instance, e.g. when one grammar
    imports another.  The fields are publicly visible, but each parser's
    pointer to its state object is protected.
    """

    def __init__(self):
        # --- parser / tree-parser fields ------------------------------

        # Stack of token-type sets that may follow each pending rule
        # invocation; grows upward.
        self.following = []

        # True between the moment an error is seen and the next
        # successful token match; suppresses duplicate error messages
        # for a single error.
        self.errorRecovery = False

        # Input-stream index of the most recent error.  Guarantees that
        # at least one token/tree node is consumed between two errors at
        # the same position -- a failsafe against infinite recovery
        # loops.
        self.lastErrorIndex = -1

        # 0 means no backtracking is going on (safe to execute actions
        # etc.); a positive value is the current backtracking depth.
        self.backtracking = 0

        # Per-rule memoization tables: ruleMemo[ruleIndex] maps a rule
        # start index to the stop token index for that rule, or
        # MEMO_RULE_FAILED.  Only used when rule memoization is on
        # (which it is by default).
        self.ruleMemo = None

        # Number of syntax errors the recognizer has encountered.
        self.syntaxErrors = 0

        # --- lexer fields (kept in the same state object so generated
        # --- code and the Lexer object share them without casting) ----

        # The token under construction.  Every lexer rule/method aims to
        # create one of these; several rules may collaborate on a single
        # token, and nextToken returns it once matched.  Subclasses that
        # emit multiple tokens should set this to the last matched token
        # (or anything non-None) so the automatic token-emit mechanism
        # does not emit another one.
        self.token = None

        # Character index in the stream where the current token started;
        # set at the start of nextToken and needed e.g. to recover the
        # token's text.
        self.tokenStartCharIndex = -1

        # Line on which the first character of the token resides.
        self.tokenStartLine = None

        # Column of the token's first character within its line.
        self.tokenStartCharPositionInLine = None

        # Channel number for the current token.
        self.channel = None

        # Token type for the current token.
        self.type = None

        # Overrides the text taken from the input char buffer for the
        # current token; also settable via setText().
        self.text = None
124
125
class BaseRecognizer(object):
    """
    @brief Common recognizer functionality.

    A generic recognizer that can handle recognizers generated from
    lexer, parser, and tree grammars.  This is all the parsing
    support code essentially; most of it is error recovery stuff and
    backtracking.
    """

    # Memoization sentinels: a rule either failed at a given start index,
    # or has not been attempted there yet.
    MEMO_RULE_FAILED = -2
    MEMO_RULE_UNKNOWN = -1

    # Copy of the default token channel, for convenience in actions.
    DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL

    # Copy of the hidden channel constant, for convenience in actions.
    HIDDEN = HIDDEN_CHANNEL

    # Token-name table used for error messages; overridden by generated
    # subclasses.
    tokenNames = None

    # The api_version attribute has been introduced in 3.3. If it is not
    # overwritten in the generated recognizer, we assume a default of v0.
    api_version = 0
151
152    def __init__(self, state=None):
153        # Input stream of the recognizer. Must be initialized by a subclass.
154        self.input = None
155
156        ## State of a lexer, parser, or tree parser are collected into a state
157        # object so the state can be shared.  This sharing is needed to
158        # have one grammar import others and share same error variables
159        # and other state variables.  It's a kind of explicit multiple
160        # inheritance via delegation of methods and shared state.
161        if state is None:
162            state = RecognizerSharedState()
163        self._state = state
164
165        if self.api_version not in compatible_api_versions:
166            raise RuntimeError(
167                ("ANTLR version mismatch: "
168                 "The recognizer has been generated with API V%s, "
169                 "but this runtime does not support this.")
170                % self.api_version)
171
    # this one only exists to shut up pylint :(
    def setInput(self, input):
        """Attach the given input stream to this recognizer."""
        self.input = input
175
176
177    def reset(self):
178        """
179        reset the parser's state; subclasses must rewinds the input stream
180        """
181
182        # wack everything related to error recovery
183        if self._state is None:
184            # no shared state work to do
185            return
186
187        self._state.following = []
188        self._state.errorRecovery = False
189        self._state.lastErrorIndex = -1
190        self._state.syntaxErrors = 0
191        # wack everything related to backtracking and memoization
192        self._state.backtracking = 0
193        if self._state.ruleMemo is not None:
194            self._state.ruleMemo = {}
195
196
197    def match(self, input, ttype, follow):
198        """
199        Match current input symbol against ttype.  Attempt
200        single token insertion or deletion error recovery.  If
201        that fails, throw MismatchedTokenException.
202
203        To turn off single token insertion or deletion error
204        recovery, override recoverFromMismatchedToken() and have it
205        throw an exception. See TreeParser.recoverFromMismatchedToken().
206        This way any error in a rule will cause an exception and
207        immediate exit from rule.  Rule would recover by resynchronizing
208        to the set of symbols that can follow rule ref.
209        """
210
211        matchedSymbol = self.getCurrentInputSymbol(input)
212        if self.input.LA(1) == ttype:
213            self.input.consume()
214            self._state.errorRecovery = False
215            return matchedSymbol
216
217        if self._state.backtracking > 0:
218            # FIXME: need to return matchedSymbol here as well. damn!!
219            raise BacktrackingFailed
220
221        matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow)
222        return matchedSymbol
223
224
    def matchAny(self, input):
        """Match any single token (the '.' wildcard) and consume it."""

        # a wildcard always matches, so a successful match ends recovery
        self._state.errorRecovery = False
        self.input.consume()
230
231
    def mismatchIsUnwantedToken(self, input, ttype):
        # The current token is spurious if the *next* token is the one we
        # were looking for; single-token deletion would then fix the input.
        return input.LA(2) == ttype
234
235
236    def mismatchIsMissingToken(self, input, follow):
237        if follow is None:
238            # we have no information about the follow; we can only consume
239            # a single token and hope for the best
240            return False
241
242        # compute what can follow this grammar element reference
243        if EOR_TOKEN_TYPE in follow:
244            viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW()
245            follow = follow | viableTokensFollowingThisRule
246
247            if len(self._state.following) > 0:
248                # remove EOR if we're not the start symbol
249                follow = follow - set([EOR_TOKEN_TYPE])
250
251        # if current token is consistent with what could come after set
252        # then we know we're missing a token; error recovery is free to
253        # "insert" the missing token
254        if input.LA(1) in follow or EOR_TOKEN_TYPE in follow:
255            return True
256
257        return False
258
259
260    def reportError(self, e):
261        """Report a recognition problem.
262
263        This method sets errorRecovery to indicate the parser is recovering
264        not parsing.  Once in recovery mode, no errors are generated.
265        To get out of recovery mode, the parser must successfully match
266        a token (after a resync).  So it will go:
267
268        1. error occurs
269        2. enter recovery mode, report error
270        3. consume until token found in resynch set
271        4. try to resume parsing
272        5. next match() will reset errorRecovery mode
273
274        If you override, make sure to update syntaxErrors if you care about
275        that.
276
277        """
278
279        # if we've already reported an error and have not matched a token
280        # yet successfully, don't report any errors.
281        if self._state.errorRecovery:
282            return
283
284        self._state.syntaxErrors += 1 # don't count spurious
285        self._state.errorRecovery = True
286
287        self.displayRecognitionError(self.tokenNames, e)
288
289
290    def displayRecognitionError(self, tokenNames, e):
291        hdr = self.getErrorHeader(e)
292        msg = self.getErrorMessage(e, tokenNames)
293        self.emitErrorMessage(hdr+" "+msg)
294
295
296    def getErrorMessage(self, e, tokenNames):
297        """
298        What error message should be generated for the various
299        exception types?
300
301        Not very object-oriented code, but I like having all error message
302        generation within one method rather than spread among all of the
303        exception classes. This also makes it much easier for the exception
304        handling because the exception classes do not have to have pointers back
305        to this object to access utility routines and so on. Also, changing
306        the message for an exception type would be difficult because you
307        would have to subclassing exception, but then somehow get ANTLR
308        to make those kinds of exception objects instead of the default.
309        This looks weird, but trust me--it makes the most sense in terms
310        of flexibility.
311
312        For grammar debugging, you will want to override this to add
313        more information such as the stack frame with
314        getRuleInvocationStack(e, this.getClass().getName()) and,
315        for no viable alts, the decision description and state etc...
316
317        Override this to change the message generated for one or more
318        exception types.
319        """
320
321        if isinstance(e, UnwantedTokenException):
322            tokenName = "<unknown>"
323            if e.expecting == EOF:
324                tokenName = "EOF"
325
326            else:
327                tokenName = self.tokenNames[e.expecting]
328
329            msg = "extraneous input %s expecting %s" % (
330                self.getTokenErrorDisplay(e.getUnexpectedToken()),
331                tokenName
332                )
333
334        elif isinstance(e, MissingTokenException):
335            tokenName = "<unknown>"
336            if e.expecting == EOF:
337                tokenName = "EOF"
338
339            else:
340                tokenName = self.tokenNames[e.expecting]
341
342            msg = "missing %s at %s" % (
343                tokenName, self.getTokenErrorDisplay(e.token)
344                )
345
346        elif isinstance(e, MismatchedTokenException):
347            tokenName = "<unknown>"
348            if e.expecting == EOF:
349                tokenName = "EOF"
350            else:
351                tokenName = self.tokenNames[e.expecting]
352
353            msg = "mismatched input " \
354                  + self.getTokenErrorDisplay(e.token) \
355                  + " expecting " \
356                  + tokenName
357
358        elif isinstance(e, MismatchedTreeNodeException):
359            tokenName = "<unknown>"
360            if e.expecting == EOF:
361                tokenName = "EOF"
362            else:
363                tokenName = self.tokenNames[e.expecting]
364
365            msg = "mismatched tree node: %s expecting %s" \
366                  % (e.node, tokenName)
367
368        elif isinstance(e, NoViableAltException):
369            msg = "no viable alternative at input " \
370                  + self.getTokenErrorDisplay(e.token)
371
372        elif isinstance(e, EarlyExitException):
373            msg = "required (...)+ loop did not match anything at input " \
374                  + self.getTokenErrorDisplay(e.token)
375
376        elif isinstance(e, MismatchedSetException):
377            msg = "mismatched input " \
378                  + self.getTokenErrorDisplay(e.token) \
379                  + " expecting set " \
380                  + repr(e.expecting)
381
382        elif isinstance(e, MismatchedNotSetException):
383            msg = "mismatched input " \
384                  + self.getTokenErrorDisplay(e.token) \
385                  + " expecting set " \
386                  + repr(e.expecting)
387
388        elif isinstance(e, FailedPredicateException):
389            msg = "rule " \
390                  + e.ruleName \
391                  + " failed predicate: {" \
392                  + e.predicateText \
393                  + "}?"
394
395        else:
396            msg = str(e)
397
398        return msg
399
400
    def getNumberOfSyntaxErrors(self):
        """
        Get number of recognition errors (lexer, parser, tree parser).  Each
        recognizer tracks its own count, so parser and lexer each have a
        separate one.  Spurious errors found between an error and the next
        valid token match are not counted.

        See also reportError().
        """
        return self._state.syntaxErrors
411
412
413    def getErrorHeader(self, e):
414        """
415        What is the error header, normally line/character position information?
416        """
417
418        source_name = self.getSourceName()
419        if source_name is not None:
420            return "%s line %d:%d" % (source_name, e.line, e.charPositionInLine)
421        return "line %d:%d" % (e.line, e.charPositionInLine)
422
423
424    def getTokenErrorDisplay(self, t):
425        """
426        How should a token be displayed in an error message? The default
427        is to display just the text, but during development you might
428        want to have a lot of information spit out.  Override in that case
429        to use t.toString() (which, for CommonToken, dumps everything about
430        the token). This is better than forcing you to override a method in
431        your token objects because you don't have to go modify your lexer
432        so that it creates a new Java type.
433        """
434
435        s = t.text
436        if s is None:
437            if t.type == EOF:
438                s = "<EOF>"
439            else:
440                s = "<"+t.type+">"
441
442        return repr(s)
443
444
445    def emitErrorMessage(self, msg):
446        """Override this method to change where error messages go"""
447        sys.stderr.write(msg + '\n')
448
449
450    def recover(self, input, re):
451        """
452        Recover from an error found on the input stream.  This is
453        for NoViableAlt and mismatched symbol exceptions.  If you enable
454        single token insertion and deletion, this will usually not
455        handle mismatched symbol exceptions but there could be a mismatched
456        token that the match() routine could not recover from.
457        """
458
459        # PROBLEM? what if input stream is not the same as last time
460        # perhaps make lastErrorIndex a member of input
461        if self._state.lastErrorIndex == input.index():
462            # uh oh, another error at same token index; must be a case
463            # where LT(1) is in the recovery token set so nothing is
464            # consumed; consume a single token so at least to prevent
465            # an infinite loop; this is a failsafe.
466            input.consume()
467
468        self._state.lastErrorIndex = input.index()
469        followSet = self.computeErrorRecoverySet()
470
471        self.beginResync()
472        self.consumeUntil(input, followSet)
473        self.endResync()
474
475
    def beginResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass
483
484
    def endResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass
492
493
    def computeErrorRecoverySet(self):
        """
        Compute the error recovery set for the current rule.  During
        rule invocation, the parser pushes the set of tokens that can
        follow that rule reference on the stack; this amounts to
        computing FIRST of what follows the rule reference in the
        enclosing rule. This local follow set only includes tokens
        from within the rule; i.e., the FIRST computation done by
        ANTLR stops at the end of a rule.

        EXAMPLE

        When you find a "no viable alt exception", the input is not
        consistent with any of the alternatives for rule r.  The best
        thing to do is to consume tokens until you see something that
        can legally follow a call to r *or* any rule that called r.
        You don't want the exact set of viable next tokens because the
        input might just be missing a token--you might consume the
        rest of the input looking for one of the missing tokens.

        Consider grammar:

        a : '[' b ']'
          | '(' b ')'
          ;
        b : c '^' INT ;
        c : ID
          | INT
          ;

        At each rule invocation, the set of tokens that could follow
        that rule is pushed on a stack.  Here are the various "local"
        follow sets:

        FOLLOW(b1_in_a) = FIRST(']') = ']'
        FOLLOW(b2_in_a) = FIRST(')') = ')'
        FOLLOW(c_in_b) = FIRST('^') = '^'

        Upon erroneous input "[]", the call chain is

        a -> b -> c

        and, hence, the follow context stack is:

        depth  local follow set     after call to rule
          0         \<EOF>                    a (from main())
          1          ']'                     b
          2          '^'                     c

        Notice that ')' is not included, because b would have to have
        been called from a different context in rule a for ')' to be
        included.

        For error recovery, we cannot consider FOLLOW(c)
        (context-sensitive or otherwise).  We need the combined set of
        all context-sensitive FOLLOW sets--the set of all tokens that
        could follow any reference in the call chain.  We need to
        resync to one of those tokens.  Note that FOLLOW(c)='^' and if
        we resync'd to that token, we'd consume until EOF.  We need to
        sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
        In this case, for input "[]", LA(1) is in this set so we would
        not consume anything and after printing an error rule c would
        return normally.  It would not find the required '^' though.
        At this point, it gets a mismatched token error and throws an
        exception (since LA(1) is not in the viable following token
        set).  The rule exception handler tries to recover, but finds
        the same recovery set and doesn't consume anything.  Rule b
        exits normally returning to rule a.  Now it finds the ']' (and
        with the successful match exits errorRecovery mode).

        So, you can see that the parser walks up the call chain looking
        for the token that was a member of the recovery set.

        Errors are not generated in errorRecovery mode.

        ANTLR's error recovery mechanism is based upon original ideas:

        "Algorithms + Data Structures = Programs" by Niklaus Wirth

        and

        "A note on error recovery in recursive descent parsers":
        http://portal.acm.org/citation.cfm?id=947902.947905

        Later, Josef Grosch had some good ideas:

        "Efficient and Comfortable Error Recovery in Recursive Descent
        Parsers":
        ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip

        Like Grosch I implemented local FOLLOW sets that are combined
        at run-time upon error to avoid overhead during parsing.
        """

        return self.combineFollows(False)
589
590
    def computeContextSensitiveRuleFOLLOW(self):
        """
        Compute the context-sensitive FOLLOW set for current rule.
        This is the set of token types that can follow a specific rule
        reference given a specific call chain.  You get the set of
        viable tokens that can possibly come next (lookahead depth 1)
        given the current call chain.  Contrast this with the
        definition of plain FOLLOW for rule r:

         FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}

        where x in T* and alpha, beta in V*; T is set of terminals and
        V is the set of terminals and nonterminals.  In other words,
        FOLLOW(r) is the set of all tokens that can possibly follow
        references to r in *any* sentential form (context).  At
        runtime, however, we know precisely which context applies as
        we have the call chain.  We may compute the exact (rather
        than covering superset) set of following tokens.

        For example, consider grammar:

        stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
             | "return" expr '.'
             ;
        expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
        atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
             | '(' expr ')'
             ;

        The FOLLOW sets are all inclusive whereas context-sensitive
        FOLLOW sets are precisely what could follow a rule reference.
        For input "i=(3);", here is the derivation:

        stat => ID '=' expr ';'
             => ID '=' atom ('+' atom)* ';'
             => ID '=' '(' expr ')' ('+' atom)* ';'
             => ID '=' '(' atom ')' ('+' atom)* ';'
             => ID '=' '(' INT ')' ('+' atom)* ';'
             => ID '=' '(' INT ')' ';'

        At the "3" token, you'd have a call chain of

          stat -> expr -> atom -> expr -> atom

        What can follow that specific nested ref to atom?  Exactly ')'
        as you can see by looking at the derivation of this specific
        input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.

        You want the exact viable token set when recovering from a
        token mismatch.  Upon token mismatch, if LA(1) is member of
        the viable next token set, then you know there is most likely
        a missing token in the input stream.  "Insert" one by just not
        throwing an exception.
        """

        return self.combineFollows(True)
647
648
649    def combineFollows(self, exact):
650        followSet = set()
651        for idx, localFollowSet in reversed(list(enumerate(self._state.following))):
652            followSet |= localFollowSet
653            if exact:
654                # can we see end of rule?
655                if EOR_TOKEN_TYPE in localFollowSet:
656                    # Only leave EOR in set if at top (start rule); this lets
657                    # us know if have to include follow(start rule); i.e., EOF
658                    if idx > 0:
659                        followSet.remove(EOR_TOKEN_TYPE)
660
661                else:
662                    # can't see end of rule, quit
663                    break
664
665        return followSet
666
667
668    def recoverFromMismatchedToken(self, input, ttype, follow):
669        """Attempt to recover from a single missing or extra token.
670
671        EXTRA TOKEN
672
673        LA(1) is not what we are looking for.  If LA(2) has the right token,
674        however, then assume LA(1) is some extra spurious token.  Delete it
675        and LA(2) as if we were doing a normal match(), which advances the
676        input.
677
678        MISSING TOKEN
679
680        If current token is consistent with what could come after
681        ttype then it is ok to 'insert' the missing token, else throw
682        exception For example, Input 'i=(3;' is clearly missing the
683        ')'.  When the parser returns from the nested call to expr, it
684        will have call chain:
685
686          stat -> expr -> atom
687
688        and it will be trying to match the ')' at this point in the
689        derivation:
690
691             => ID '=' '(' INT ')' ('+' atom)* ';'
692                                ^
693        match() will see that ';' doesn't match ')' and report a
694        mismatched token error.  To recover, it sees that LA(1)==';'
695        is in the set of tokens that can follow the ')' token
696        reference in rule atom.  It can assume that you forgot the ')'.
697        """
698
699        e = None
700
701        # if next token is what we are looking for then "delete" this token
702        if self.mismatchIsUnwantedToken(input, ttype):
703            e = UnwantedTokenException(ttype, input)
704
705            self.beginResync()
706            input.consume() # simply delete extra token
707            self.endResync()
708
709            # report after consuming so AW sees the token in the exception
710            self.reportError(e)
711
712            # we want to return the token we're actually matching
713            matchedSymbol = self.getCurrentInputSymbol(input)
714
715            # move past ttype token as if all were ok
716            input.consume()
717            return matchedSymbol
718
719        # can't recover with single token deletion, try insertion
720        if self.mismatchIsMissingToken(input, follow):
721            inserted = self.getMissingSymbol(input, e, ttype, follow)
722            e = MissingTokenException(ttype, input, inserted)
723
724            # report after inserting so AW sees the token in the exception
725            self.reportError(e)
726            return inserted
727
728        # even that didn't work; must throw the exception
729        e = MismatchedTokenException(ttype, input)
730        raise e
731
732
733    def recoverFromMismatchedSet(self, input, e, follow):
734        """Not currently used"""
735
736        if self.mismatchIsMissingToken(input, follow):
737            self.reportError(e)
738            # we don't know how to conjure up a token for sets yet
739            return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow)
740
741        # TODO do single token deletion like above for Token mismatch
742        raise e
743
744
    def getCurrentInputSymbol(self, input):
        """
        Match needs to return the current input symbol, which gets put
        into the label for the associated token ref; e.g., x=ID.  Token
        and tree parsers need to return different objects, so rather than
        test for the input stream type or change the IntStream interface,
        this simple hook asks the recognizer what the current input
        symbol is.

        The base implementation returns None; this is ignored for lexers.
        """

        return None
758
759
    def getMissingSymbol(self, input, e, expectedTokenType, follow):
        """Conjure up a missing token during error recovery.

        The recognizer attempts to recover from single missing
        symbols. But, actions might refer to that missing symbol.
        For example, x=ID {f($x);}. The action clearly assumes
        that there has been an identifier matched previously and that
        $x points at that token. If that token is missing, but
        the next token in the stream is what we want, we assume that
        this token is missing and we keep going. Because we
        have to return some token to replace the missing token,
        we have to conjure one up. This method gives the user control
        over the tokens returned for missing tokens. Mostly,
        you will want to create something special for identifier
        tokens. For literals such as '{' and ',', the default
        action in the parser or tree parser works. It simply creates
        a CommonToken of the appropriate type. The text will be the token.
        If you change what tokens must be created by the lexer,
        override this method to create the appropriate tokens.

        The base implementation returns None; parser subclasses override.
        """

        return None
782
783
784##     def recoverFromMissingElement(self, input, e, follow):
785##         """
786##         This code is factored out from mismatched token and mismatched set
787##         recovery.  It handles "single token insertion" error recovery for
788##         both.  No tokens are consumed to recover from insertions.  Return
789##         true if recovery was possible else return false.
790##         """
791
792##         if self.mismatchIsMissingToken(input, follow):
793##             self.reportError(e)
794##             return True
795
796##         # nothing to do; throw exception
797##         return False
798
799
800    def consumeUntil(self, input, tokenTypes):
801        """
802        Consume tokens until one matches the given token or token set
803
804        tokenTypes can be a single token type or a set of token types
805
806        """
807
808        if not isinstance(tokenTypes, (set, frozenset)):
809            tokenTypes = frozenset([tokenTypes])
810
811        ttype = input.LA(1)
812        while ttype != EOF and ttype not in tokenTypes:
813            input.consume()
814            ttype = input.LA(1)
815
816
817    def getRuleInvocationStack(self):
818        """
819        Return List<String> of the rules in your parser instance
820        leading up to a call to this method.  You could override if
821        you want more details such as the file/line info of where
822        in the parser java code a rule is invoked.
823
824        This is very useful for error messages and for context-sensitive
825        error recovery.
826
827        You must be careful, if you subclass a generated recognizers.
828        The default implementation will only search the module of self
829        for rules, but the subclass will not contain any rules.
830        You probably want to override this method to look like
831
832        def getRuleInvocationStack(self):
833            return self._getRuleInvocationStack(<class>.__module__)
834
835        where <class> is the class of the generated recognizer, e.g.
836        the superclass of self.
837        """
838
839        return self._getRuleInvocationStack(self.__module__)
840
841
842    def _getRuleInvocationStack(cls, module):
843        """
844        A more general version of getRuleInvocationStack where you can
845        pass in, for example, a RecognitionException to get it's rule
846        stack trace.  This routine is shared with all recognizers, hence,
847        static.
848
849        TODO: move to a utility class or something; weird having lexer call
850        this
851        """
852
853        # mmmhhh,... perhaps look at the first argument
854        # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of
855        # requested recognizer...
856
857        rules = []
858        for frame in reversed(inspect.stack()):
859            code = frame[0].f_code
860            codeMod = inspect.getmodule(code)
861            if codeMod is None:
862                continue
863
864            # skip frames not in requested module
865            if codeMod.__name__ != module:
866                continue
867
868            # skip some unwanted names
869            if code.co_name in ('nextToken', '<module>'):
870                continue
871
872            rules.append(code.co_name)
873
874        return rules
875
876    _getRuleInvocationStack = classmethod(_getRuleInvocationStack)
877
878
879    def getBacktrackingLevel(self):
880        return self._state.backtracking
881
882    def setBacktrackingLevel(self, n):
883        self._state.backtracking = n
884
885
886    def getGrammarFileName(self):
887        """For debugging and other purposes, might want the grammar name.
888
889        Have ANTLR generate an implementation for this method.
890        """
891
892        return self.grammarFileName
893
894
895    def getSourceName(self):
896        raise NotImplementedError
897
898
899    def toStrings(self, tokens):
900        """A convenience method for use most often with template rewrites.
901
902        Convert a List<Token> to List<String>
903        """
904
905        if tokens is None:
906            return None
907
908        return [token.text for token in tokens]
909
910
911    def getRuleMemoization(self, ruleIndex, ruleStartIndex):
912        """
913        Given a rule number and a start token index number, return
914        MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
915        start index.  If this rule has parsed input starting from the
916        start index before, then return where the rule stopped parsing.
917        It returns the index of the last token matched by the rule.
918        """
919
920        if ruleIndex not in self._state.ruleMemo:
921            self._state.ruleMemo[ruleIndex] = {}
922
923        return self._state.ruleMemo[ruleIndex].get(
924            ruleStartIndex, self.MEMO_RULE_UNKNOWN
925            )
926
927
928    def alreadyParsedRule(self, input, ruleIndex):
929        """
930        Has this rule already parsed input at the current index in the
931        input stream?  Return the stop token index or MEMO_RULE_UNKNOWN.
932        If we attempted but failed to parse properly before, return
933        MEMO_RULE_FAILED.
934
935        This method has a side-effect: if we have seen this input for
936        this rule and successfully parsed before, then seek ahead to
937        1 past the stop token matched for this rule last time.
938        """
939
940        stopIndex = self.getRuleMemoization(ruleIndex, input.index())
941        if stopIndex == self.MEMO_RULE_UNKNOWN:
942            return False
943
944        if stopIndex == self.MEMO_RULE_FAILED:
945            raise BacktrackingFailed
946
947        else:
948            input.seek(stopIndex + 1)
949
950        return True
951
952
953    def memoize(self, input, ruleIndex, ruleStartIndex, success):
954        """
955        Record whether or not this rule parsed the input at this position
956        successfully.
957        """
958
959        if success:
960            stopTokenIndex = input.index() - 1
961        else:
962            stopTokenIndex = self.MEMO_RULE_FAILED
963
964        if ruleIndex in self._state.ruleMemo:
965            self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex
966
967
968    def traceIn(self, ruleName, ruleIndex, inputSymbol):
969        sys.stdout.write("enter %s %s" % (ruleName, inputSymbol))
970
971        if self._state.backtracking > 0:
972            sys.stdout.write(" backtracking=%s" % self._state.backtracking)
973
974        sys.stdout.write('\n')
975
976
977    def traceOut(self, ruleName, ruleIndex, inputSymbol):
978        sys.stdout.write("exit %s %s" % (ruleName, inputSymbol))
979
980        if self._state.backtracking > 0:
981            sys.stdout.write(" backtracking=%s" % self._state.backtracking)
982
983        # mmmm... we use BacktrackingFailed exceptions now. So how could we
984        # get that information here?
985        #if self._state.failed:
986        #    sys.stdout.write(" failed")
987        #else:
988        #    sys.stdout.write(" succeeded")
989
990        sys.stdout.write('\n')
991
992
993class TokenSource(object):
994    """
995    @brief Abstract baseclass for token producers.
996
997    A source of tokens must provide a sequence of tokens via nextToken()
998    and also must reveal it's source of characters; CommonToken's text is
999    computed from a CharStream; it only store indices into the char stream.
1000
1001    Errors from the lexer are never passed to the parser.  Either you want
1002    to keep going or you do not upon token recognition error.  If you do not
1003    want to continue lexing then you do not want to continue parsing.  Just
1004    throw an exception not under RecognitionException and Java will naturally
1005    toss you all the way out of the recognizers.  If you want to continue
1006    lexing then you should not throw an exception to the parser--it has already
1007    requested a token.  Keep lexing until you get a valid one.  Just report
1008    errors and keep going, looking for a valid token.
1009    """
1010
1011    def nextToken(self):
1012        """Return a Token object from your input stream (usually a CharStream).
1013
1014        Do not fail/return upon lexing error; keep chewing on the characters
1015        until you get a good one; errors are not passed through to the parser.
1016        """
1017
1018        raise NotImplementedError
1019
1020
1021    def __iter__(self):
1022        """The TokenSource is an interator.
1023
1024        The iteration will not include the final EOF token, see also the note
1025        for the next() method.
1026
1027        """
1028
1029        return self
1030
1031
1032    def next(self):
1033        """Return next token or raise StopIteration.
1034
1035        Note that this will raise StopIteration when hitting the EOF token,
1036        so EOF will not be part of the iteration.
1037
1038        """
1039
1040        token = self.nextToken()
1041        if token is None or token.type == EOF:
1042            raise StopIteration
1043        return token
1044
1045
class Lexer(BaseRecognizer, TokenSource):
    """
    @brief Baseclass for generated lexer classes.

    A lexer is recognizer that draws input symbols from a character stream.
    lexer grammars result in a subclass of this object. A Lexer object
    uses simplified match() and error recovery mechanisms in the interest
    of speed.
    """

    def __init__(self, input, state=None):
        BaseRecognizer.__init__(self, state)
        TokenSource.__init__(self)

        # Where is the lexer drawing characters from?
        self.input = input


    def reset(self):
        """Reset recognizer state, rewind the input and clear all
        per-token lexer state variables."""

        BaseRecognizer.reset(self) # reset all recognizer state variables

        if self.input is not None:
            # rewind the input
            self.input.seek(0)

        if self._state is None:
            # no shared state work to do
            return

        # wack Lexer state variables
        self._state.token = None
        self._state.type = INVALID_TOKEN_TYPE
        self._state.channel = DEFAULT_CHANNEL
        self._state.tokenStartCharIndex = -1
        self._state.tokenStartLine = -1
        self._state.tokenStartCharPositionInLine = -1
        self._state.text = None


    def makeEOFToken(self):
        """Create an EOF token positioned at the current input index,
        carrying the stream's current line/column."""
        eof = CommonToken(
            type=EOF, channel=DEFAULT_CHANNEL,
            input=self.input,
            start=self.input.index(), stop=self.input.index())
        eof.line = self.input.line
        eof.charPositionInLine = self.input.charPositionInLine
        return eof

    def nextToken(self):
        """
        Return a token from this source; i.e., match a token on the char
        stream.

        Loops until a real (non-skipped) token is matched or EOF is
        reached; recognition errors are reported and recovered from
        rather than propagated to the caller.
        """

        while 1:
            # reset per-token state before each match attempt
            self._state.token = None
            self._state.channel = DEFAULT_CHANNEL
            self._state.tokenStartCharIndex = self.input.index()
            self._state.tokenStartCharPositionInLine = self.input.charPositionInLine
            self._state.tokenStartLine = self.input.line
            self._state.text = None
            if self.input.LA(1) == EOF:
                return self.makeEOFToken()

            try:
                self.mTokens()

                if self._state.token is None:
                    # rule matched but did not emit: create a default token
                    self.emit()

                elif self._state.token == SKIP_TOKEN:
                    # rule asked to skip this token: match the next one
                    continue

                return self._state.token

            except NoViableAltException, re:
                self.reportError(re)
                self.recover(re) # throw out current char and try again

            except RecognitionException, re:
                self.reportError(re)
                # match() routine has already called recover()


    def skip(self):
        """
        Instruct the lexer to skip creating a token for current lexer rule
        and look for another token.  nextToken() knows to keep looking when
        a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
        if token==null at end of any token rule, it creates one for you
        and emits it.
        """

        self._state.token = SKIP_TOKEN


    def mTokens(self):
        """This is the lexer entry point that sets instance var 'token'"""

        # abstract method
        raise NotImplementedError


    def setCharStream(self, input):
        """Set the char stream and reset the lexer"""
        self.input = None
        self.reset()
        self.input = input


    def getSourceName(self):
        """Delegate to the char stream's source name."""
        return self.input.getSourceName()


    def emit(self, token=None):
        """
        The standard method called to automatically emit a token at the
        outermost lexical rule.  The token object should point into the
        char buffer start..stop.  If there is a text override in 'text',
        use that to set the token's text.  Override this method to emit
        custom Token objects.

        If you are building trees, then you should also override
        Parser or TreeParser.getMissingSymbol().
        """

        if token is None:
            # build a default token spanning the current rule's characters
            token = CommonToken(
                input=self.input,
                type=self._state.type,
                channel=self._state.channel,
                start=self._state.tokenStartCharIndex,
                stop=self.getCharIndex()-1
                )
            token.line = self._state.tokenStartLine
            token.text = self._state.text
            token.charPositionInLine = self._state.tokenStartCharPositionInLine

        self._state.token = token

        return token


    def match(self, s):
        """Match a literal string or a single character code on the input.

        s may be a string (each character matched in turn) or an integer
        character code; LA(1) yields character codes, hence the ord().
        On mismatch while backtracking, BacktrackingFailed is raised;
        otherwise recover() is called and MismatchedTokenException is
        raised.
        """
        if isinstance(s, basestring):
            for c in s:
                if self.input.LA(1) != ord(c):
                    if self._state.backtracking > 0:
                        raise BacktrackingFailed

                    mte = MismatchedTokenException(c, self.input)
                    self.recover(mte)
                    raise mte

                self.input.consume()

        else:
            if self.input.LA(1) != s:
                if self._state.backtracking > 0:
                    raise BacktrackingFailed

                mte = MismatchedTokenException(unichr(s), self.input)
                self.recover(mte) # don't really recover; just consume in lexer
                raise mte

            self.input.consume()


    def matchAny(self):
        """Match any single character by simply consuming it."""
        self.input.consume()


    def matchRange(self, a, b):
        """Match one character in the inclusive code range a..b.

        Raises BacktrackingFailed while backtracking, otherwise recovers
        and raises MismatchedRangeException on mismatch.
        """
        if self.input.LA(1) < a or self.input.LA(1) > b:
            if self._state.backtracking > 0:
                raise BacktrackingFailed

            mre = MismatchedRangeException(unichr(a), unichr(b), self.input)
            self.recover(mre)
            raise mre

        self.input.consume()


    def getLine(self):
        """Current line number of the char stream."""
        return self.input.line


    def getCharPositionInLine(self):
        """Current column (0-based) of the char stream."""
        return self.input.charPositionInLine


    def getCharIndex(self):
        """What is the index of the current character of lookahead?"""

        return self.input.index()


    def getText(self):
        """
        Return the text matched so far for the current token or any
        text override.
        """
        if self._state.text is not None:
            return self._state.text

        return self.input.substring(
            self._state.tokenStartCharIndex,
            self.getCharIndex()-1
            )


    def setText(self, text):
        """
        Set the complete text of this token; it wipes any previous
        changes to the text.
        """
        self._state.text = text


    text = property(getText, setText)


    def reportError(self, e):
        ## TODO: not thought about recovery in lexer yet.

        ## # if we've already reported an error and have not matched a token
        ## # yet successfully, don't report any errors.
        ## if self.errorRecovery:
        ##     #System.err.print("[SPURIOUS] ");
        ##     return;
        ##
        ## self.errorRecovery = True

        self.displayRecognitionError(self.tokenNames, e)


    def getErrorMessage(self, e, tokenNames):
        """Build a character-oriented error message for the given
        RecognitionException subclass; falls back to the base
        recognizer's message for unknown exception types."""
        msg = None

        if isinstance(e, MismatchedTokenException):
            msg = "mismatched character " \
                  + self.getCharErrorDisplay(e.c) \
                  + " expecting " \
                  + self.getCharErrorDisplay(e.expecting)

        elif isinstance(e, NoViableAltException):
            msg = "no viable alternative at character " \
                  + self.getCharErrorDisplay(e.c)

        elif isinstance(e, EarlyExitException):
            msg = "required (...)+ loop did not match anything at character " \
                  + self.getCharErrorDisplay(e.c)

        elif isinstance(e, MismatchedNotSetException):
            # NOTE: must be tested before MismatchedSetException, its base
            msg = "mismatched character " \
                  + self.getCharErrorDisplay(e.c) \
                  + " expecting set " \
                  + repr(e.expecting)

        elif isinstance(e, MismatchedSetException):
            msg = "mismatched character " \
                  + self.getCharErrorDisplay(e.c) \
                  + " expecting set " \
                  + repr(e.expecting)

        elif isinstance(e, MismatchedRangeException):
            msg = "mismatched character " \
                  + self.getCharErrorDisplay(e.c) \
                  + " expecting set " \
                  + self.getCharErrorDisplay(e.a) \
                  + ".." \
                  + self.getCharErrorDisplay(e.b)

        else:
            msg = BaseRecognizer.getErrorMessage(self, e, tokenNames)

        return msg


    def getCharErrorDisplay(self, c):
        """Render character c for an error message ('<EOF>' for EOF)."""
        if c == EOF:
            c = '<EOF>'
        return repr(c)


    def recover(self, re):
        """
        Lexers can normally match any char in it's vocabulary after matching
        a token, so do the easy thing and just kill a character and hope
        it all works out.  You can instead use the rule invocation stack
        to do sophisticated error recovery if you are in a fragment rule.
        """

        self.input.consume()


    def traceIn(self, ruleName, ruleIndex):
        """Trace rule entry, decorating the symbol with line/column."""
        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
                                         self.getLine(),
                                         self.getCharPositionInLine()
                                         )

        BaseRecognizer.traceIn(self, ruleName, ruleIndex, inputSymbol)


    def traceOut(self, ruleName, ruleIndex):
        """Trace rule exit, decorating the symbol with line/column."""
        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
                                         self.getLine(),
                                         self.getCharPositionInLine()
                                         )

        BaseRecognizer.traceOut(self, ruleName, ruleIndex, inputSymbol)
1359
1360
1361
class Parser(BaseRecognizer):
    """
    @brief Baseclass for generated parser classes.
    """

    def __init__(self, lexer, state=None):
        """Wrap the given token stream (usually a lexer) as parser input."""
        BaseRecognizer.__init__(self, state)

        self.input = lexer


    def reset(self):
        """Reset all recognizer state variables and rewind the token stream."""
        BaseRecognizer.reset(self)
        if self.input is not None:
            self.input.seek(0)


    def getCurrentInputSymbol(self, input):
        """For a token parser, the current input symbol is the lookahead token."""
        return input.LT(1)


    def getMissingSymbol(self, input, e, expectedTokenType, follow):
        """Conjure up a CommonToken standing in for a missing token.

        The fabricated token carries the expected type, a descriptive
        '<missing ...>' text, the DEFAULT_CHANNEL, and the position of
        the nearest real token.
        """

        if expectedTokenType == EOF:
            tokenText = "<missing EOF>"
        else:
            tokenText = "<missing " + self.tokenNames[expectedTokenType] + ">"

        token = CommonToken(type=expectedTokenType, text=tokenText)

        # borrow line/column from the nearest real token; at EOF the
        # previous token carries the better position info
        nearest = input.LT(1)
        if nearest.type == EOF:
            nearest = input.LT(-1)

        if nearest is not None:
            token.line = nearest.line
            token.charPositionInLine = nearest.charPositionInLine

        token.channel = DEFAULT_CHANNEL
        return token


    def setTokenStream(self, input):
        """Install a new token stream, resetting the parser first."""

        self.input = None
        self.reset()
        self.input = input


    def getTokenStream(self):
        """Return the token stream this parser reads from."""
        return self.input


    def getSourceName(self):
        """Delegate to the token stream's source name."""
        return self.input.getSourceName()


    def traceIn(self, ruleName, ruleIndex):
        """Trace rule entry using the lookahead token as the symbol."""
        BaseRecognizer.traceIn(self, ruleName, ruleIndex, self.input.LT(1))


    def traceOut(self, ruleName, ruleIndex):
        """Trace rule exit using the lookahead token as the symbol."""
        BaseRecognizer.traceOut(self, ruleName, ruleIndex, self.input.LT(1))
1422
1423
class RuleReturnScope(object):
    """
    Rules can return start/stop info as well as possible trees and
    templates.  This base answers None for everything; subclasses
    override the accessors for values they actually track.
    """

    def getStart(self):
        """Return the start token or tree."""
        return None


    def getStop(self):
        """Return the stop token or tree."""
        return None


    def getTree(self):
        """Has a value potentially if output=AST."""
        return None


    def getTemplate(self):
        """Has a value potentially if output=template."""
        return None
1447
1448
class ParserRuleReturnScope(RuleReturnScope):
    """
    Rules that return more than a single value must return an object
    aggregating all of them.  Besides the properties defined in
    RuleLabelScope.predefinedRulePropertiesScope there may be
    user-defined return values; this class defines only the minimum
    properties that are always present, plus accessors for the others
    that might exist depending on output options such as template and
    tree.

    Note that text is not an actual property of the return value; it is
    computed from start and stop via the input stream's toString()
    method.  A ctor taking the input stream could be added, but getting
    .text would be ill-defined anyway when a rule matches tokens from
    multiple input streams.

    No getters are used for the fields of simple value aggregates like
    this one; the getter methods below exist only to satisfy the
    superclass interface.
    """

    def __init__(self):
        self.start = None   # first token matched by the rule
        self.stop = None    # last token matched by the rule
        self.tree = None    # only used when output=AST


    def getStart(self):
        """Superclass interface; prefer reading .start directly."""
        return self.start


    def getStop(self):
        """Superclass interface; prefer reading .stop directly."""
        return self.stop


    def getTree(self):
        """Superclass interface; prefer reading .tree directly."""
        return self.tree
1486