15e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
25e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis# Licensed to PSF under a Contributor Agreement.
35e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
45e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis"""Parser engine for the grammar tables generated by pgen.
55e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
65e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. LöwisThe grammar table must be loaded first.
75e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
85e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. LöwisSee Parser/parser.c in the Python distribution for additional info on
95e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwishow this parsing engine works.
105e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
115e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis"""
125e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
135e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis# Local imports
145e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwisfrom . import token
155e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
165e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwisclass ParseError(Exception):
175e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    """Exception to signal the parser is stuck."""
185e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
195e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    def __init__(self, msg, type, value, context):
205e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
215e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                           (msg, type, value, context))
225e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.msg = msg
235e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.type = type
245e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.value = value
255e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.context = context
265e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
275e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwisclass Parser(object):
285e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    """Parser engine.
295e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
305e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    The proper usage sequence is:
315e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
325e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    p = Parser(grammar, [converter])  # create instance
335e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    p.setup([start])                  # prepare for parsing
345e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    <for each input token>:
355e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        if p.addtoken(...):           # parse a token; may raise ParseError
365e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            break
375e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    root = p.rootnode                 # root of abstract syntax tree
385e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
395e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    A Parser instance may be reused by calling setup() repeatedly.
405e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
415e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    A Parser instance contains state pertaining to the current token
425e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    sequence, and should not be used concurrently by different threads
435e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    to parse separate token sequences.
445e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
455e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    See driver.py for how to get input tokens by tokenizing a file or
465e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    string.
475e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
485e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    Parsing is complete when addtoken() returns True; the root of the
495e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    abstract syntax tree can then be retrieved from the rootnode
505e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    instance variable.  When a syntax error occurs, addtoken() raises
515e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    the ParseError exception.  There is no error recovery; the parser
525e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    cannot be used after a syntax error was reported (but it can be
535e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    reinitialized by calling setup()).
545e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
555e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    """
565e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
575e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    def __init__(self, grammar, convert=None):
585e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        """Constructor.
595e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
605e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        The grammar argument is a grammar.Grammar instance; see the
615e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        grammar module for more information.
625e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
635e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        The parser is not ready yet for parsing; you must call the
645e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        setup() method to get it started.
655e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
665e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        The optional convert argument is a function mapping concrete
675e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        syntax tree nodes to abstract syntax tree nodes.  If not
685e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        given, no conversion is done and the syntax tree produced is
695e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        the concrete syntax tree.  If given, it must be a function of
705e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        two arguments, the first being the grammar (a grammar.Grammar
715e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        instance), and the second being the concrete syntax tree node
725e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        to be converted.  The syntax tree is converted from the bottom
735e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        up.
745e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
755e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        A concrete syntax tree node is a (type, value, context, nodes)
765e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        tuple, where type is the node type (a token or symbol number),
775e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        value is None for symbols and a string for tokens, context is
785e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        None or an opaque value used for error reporting (typically a
795e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        (lineno, offset) pair), and nodes is a list of children for
805e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        symbols, and None for tokens.
815e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
825e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        An abstract syntax tree node may be anything; this is entirely
835e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        up to the converter function.
845e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
855e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        """
865e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.grammar = grammar
875e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.convert = convert or (lambda grammar, node: node)
885e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
895e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    def setup(self, start=None):
905e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        """Prepare for parsing.
915e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
925e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        This *must* be called before starting to parse.
935e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
945e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        The optional argument is an alternative start symbol; it
955e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        defaults to the grammar's start symbol.
965e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
975e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        You can use a Parser instance to parse any number of programs;
985e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        each time you call setup() the parser is reset to an initial
995e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        state determined by the (implicit or explicit) start symbol.
1005e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
1015e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        """
1025e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        if start is None:
1035e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            start = self.grammar.start
1045e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        # Each stack entry is a tuple: (dfa, state, node).
1055e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        # A node is a tuple: (type, value, context, children),
1065e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        # where children is a list of nodes or None, and context may be None.
1075e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        newnode = (start, None, None, [])
1085e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        stackentry = (self.grammar.dfas[start], 0, newnode)
1095e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.stack = [stackentry]
1105e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.rootnode = None
1115e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.used_names = set() # Aliased to self.rootnode.used_names in pop()
1125e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
1135e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    def addtoken(self, type, value, context):
1145e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        """Add a token; return True iff this is the end of the program."""
1155e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        # Map from token to label
1165e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        ilabel = self.classify(type, value, context)
1175e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        # Loop until the token is shifted; may raise exceptions
1185e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        while True:
1195e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            dfa, state, node = self.stack[-1]
1205e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            states, first = dfa
1215e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            arcs = states[state]
1225e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            # Look for a state with this label
1235e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            for i, newstate in arcs:
1245e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                t, v = self.grammar.labels[i]
1255e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                if ilabel == i:
1265e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    # Look it up in the list of labels
1275e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    assert t < 256
1285e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    # Shift a token; we're done with it
1295e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    self.shift(type, value, newstate, context)
1305e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    # Pop while we are in an accept-only state
1315e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    state = newstate
1325e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    while states[state] == [(0, state)]:
1335e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                        self.pop()
1345e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                        if not self.stack:
1355e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                            # Done parsing!
1365e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                            return True
1375e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                        dfa, state, node = self.stack[-1]
1385e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                        states, first = dfa
1395e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    # Done with this token
1405e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    return False
1415e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                elif t >= 256:
1425e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    # See if it's a symbol and if we're in its first set
1435e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    itsdfa = self.grammar.dfas[t]
1445e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    itsstates, itsfirst = itsdfa
1455e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    if ilabel in itsfirst:
1465e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                        # Push a symbol
1475e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                        self.push(t, self.grammar.dfas[t], newstate, context)
1485e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                        break # To continue the outer while loop
1495e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            else:
1505e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                if (0, state) in arcs:
1515e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    # An accepting state, pop it and try something else
1525e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    self.pop()
1535e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    if not self.stack:
1545e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                        # Done parsing, but another token is input
1555e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                        raise ParseError("too much input",
1565e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                                         type, value, context)
1575e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                else:
1585e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    # No success finding a transition
1595e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                    raise ParseError("bad input", type, value, context)
1605e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
1615e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    def classify(self, type, value, context):
1625e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        """Turn a token into a label.  (Internal)"""
1635e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        if type == token.NAME:
1645e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            # Keep a listing of all used names
1655e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            self.used_names.add(value)
1665e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            # Check for reserved words
1675e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            ilabel = self.grammar.keywords.get(value)
1685e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            if ilabel is not None:
1695e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                return ilabel
1705e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        ilabel = self.grammar.tokens.get(type)
1715e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        if ilabel is None:
1725e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            raise ParseError("bad token", type, value, context)
1735e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        return ilabel
1745e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
1755e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    def shift(self, type, value, newstate, context):
1765e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        """Shift a token.  (Internal)"""
1775e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        dfa, state, node = self.stack[-1]
1785e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        newnode = (type, value, context, None)
1795e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        newnode = self.convert(self.grammar, newnode)
1805e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        if newnode is not None:
1815e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            node[-1].append(newnode)
1825e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.stack[-1] = (dfa, newstate, node)
1835e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
1845e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    def push(self, type, newdfa, newstate, context):
1855e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        """Push a nonterminal.  (Internal)"""
1865e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        dfa, state, node = self.stack[-1]
1875e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        newnode = (type, None, context, [])
1885e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.stack[-1] = (dfa, newstate, node)
1895e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        self.stack.append((newdfa, 0, newnode))
1905e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis
1915e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis    def pop(self):
1925e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        """Pop a nonterminal.  (Internal)"""
1935e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        popdfa, popstate, popnode = self.stack.pop()
1945e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        newnode = self.convert(self.grammar, popnode)
1955e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis        if newnode is not None:
1965e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            if self.stack:
1975e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                dfa, state, node = self.stack[-1]
1985e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                node[-1].append(newnode)
1995e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis            else:
2005e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                self.rootnode = newnode
2015e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis                self.rootnode.used_names = self.used_names
202