15e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 25e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis# Licensed to PSF under a Contributor Agreement. 35e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 45e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis"""Parser engine for the grammar tables generated by pgen. 55e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 65e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. LöwisThe grammar table must be loaded first. 75e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 85e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. LöwisSee Parser/parser.c in the Python distribution for additional info on 95e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwishow this parsing engine works. 105e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 115e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis""" 125e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 135e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis# Local imports 145e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwisfrom . import token 155e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 165e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwisclass ParseError(Exception): 175e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """Exception to signal the parser is stuck.""" 185e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 195e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis def __init__(self, msg, type, value, context): 205e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis Exception.__init__(self, "%s: type=%r, value=%r, context=%r" % 215e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis (msg, type, value, context)) 225e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.msg = msg 235e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.type = type 245e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.value = value 255e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.context = context 265e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 275e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwisclass Parser(object): 285e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """Parser engine. 295e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 305e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis The proper usage sequence is: 315e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 325e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis p = Parser(grammar, [converter]) # create instance 335e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis p.setup([start]) # prepare for parsing 345e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis <for each input token>: 355e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if p.addtoken(...): # parse a token; may raise ParseError 365e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis break 375e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis root = p.rootnode # root of abstract syntax tree 385e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 395e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis A Parser instance may be reused by calling setup() repeatedly. 405e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 415e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis A Parser instance contains state pertaining to the current token 425e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis sequence, and should not be used concurrently by different threads 435e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis to parse separate token sequences. 445e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 455e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis See driver.py for how to get input tokens by tokenizing a file or 465e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis string. 475e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 485e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis Parsing is complete when addtoken() returns True; the root of the 495e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis abstract syntax tree can then be retrieved from the rootnode 505e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis instance variable. When a syntax error occurs, addtoken() raises 515e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis the ParseError exception. There is no error recovery; the parser 525e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis cannot be used after a syntax error was reported (but it can be 535e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis reinitialized by calling setup()). 545e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 555e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """ 565e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 575e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis def __init__(self, grammar, convert=None): 585e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """Constructor. 595e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 605e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis The grammar argument is a grammar.Grammar instance; see the 615e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis grammar module for more information. 625e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 635e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis The parser is not ready yet for parsing; you must call the 645e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis setup() method to get it started. 655e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 665e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis The optional convert argument is a function mapping concrete 675e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis syntax tree nodes to abstract syntax tree nodes. If not 685e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis given, no conversion is done and the syntax tree produced is 695e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis the concrete syntax tree. If given, it must be a function of 705e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis two arguments, the first being the grammar (a grammar.Grammar 715e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis instance), and the second being the concrete syntax tree node 725e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis to be converted. The syntax tree is converted from the bottom 735e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis up. 745e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 755e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis A concrete syntax tree node is a (type, value, context, nodes) 765e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis tuple, where type is the node type (a token or symbol number), 775e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis value is None for symbols and a string for tokens, context is 785e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis None or an opaque value used for error reporting (typically a 795e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis (lineno, offset) pair), and nodes is a list of children for 805e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis symbols, and None for tokens. 815e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 825e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis An abstract syntax tree node may be anything; this is entirely 835e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis up to the converter function. 845e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 855e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """ 865e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.grammar = grammar 875e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.convert = convert or (lambda grammar, node: node) 885e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 895e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis def setup(self, start=None): 905e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """Prepare for parsing. 915e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 925e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis This *must* be called before starting to parse. 935e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 945e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis The optional argument is an alternative start symbol; it 955e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis defaults to the grammar's start symbol. 965e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 975e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis You can use a Parser instance to parse any number of programs; 985e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis each time you call setup() the parser is reset to an initial 995e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis state determined by the (implicit or explicit) start symbol. 1005e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 1015e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """ 1025e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if start is None: 1035e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis start = self.grammar.start 1045e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Each stack entry is a tuple: (dfa, state, node). 1055e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # A node is a tuple: (type, value, context, children), 1065e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # where children is a list of nodes or None, and context may be None. 1075e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis newnode = (start, None, None, []) 1085e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis stackentry = (self.grammar.dfas[start], 0, newnode) 1095e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.stack = [stackentry] 1105e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.rootnode = None 1115e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.used_names = set() # Aliased to self.rootnode.used_names in pop() 1125e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 1135e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis def addtoken(self, type, value, context): 1145e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """Add a token; return True iff this is the end of the program.""" 1155e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Map from token to label 1165e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis ilabel = self.classify(type, value, context) 1175e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Loop until the token is shifted; may raise exceptions 1185e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis while True: 1195e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis dfa, state, node = self.stack[-1] 1205e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis states, first = dfa 1215e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis arcs = states[state] 1225e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Look for a state with this label 1235e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis for i, newstate in arcs: 1245e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis t, v = self.grammar.labels[i] 1255e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if ilabel == i: 1265e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Look it up in the list of labels 1275e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis assert t < 256 1285e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Shift a token; we're done with it 1295e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.shift(type, value, newstate, context) 1305e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Pop while we are in an accept-only state 1315e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis state = newstate 1325e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis while states[state] == [(0, state)]: 1335e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.pop() 1345e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if not self.stack: 1355e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Done parsing! 1365e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis return True 1375e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis dfa, state, node = self.stack[-1] 1385e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis states, first = dfa 1395e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Done with this token 1405e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis return False 1415e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis elif t >= 256: 1425e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # See if it's a symbol and if we're in its first set 1435e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis itsdfa = self.grammar.dfas[t] 1445e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis itsstates, itsfirst = itsdfa 1455e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if ilabel in itsfirst: 1465e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Push a symbol 1475e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.push(t, self.grammar.dfas[t], newstate, context) 1485e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis break # To continue the outer while loop 1495e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis else: 1505e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if (0, state) in arcs: 1515e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # An accepting state, pop it and try something else 1525e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.pop() 1535e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if not self.stack: 1545e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Done parsing, but another token is input 1555e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis raise ParseError("too much input", 1565e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis type, value, context) 1575e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis else: 1585e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # No success finding a transition 1595e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis raise ParseError("bad input", type, value, context) 1605e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 1615e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis def classify(self, type, value, context): 1625e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """Turn a token into a label. (Internal)""" 1635e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if type == token.NAME: 1645e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Keep a listing of all used names 1655e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.used_names.add(value) 1665e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis # Check for reserved words 1675e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis ilabel = self.grammar.keywords.get(value) 1685e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if ilabel is not None: 1695e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis return ilabel 1705e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis ilabel = self.grammar.tokens.get(type) 1715e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if ilabel is None: 1725e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis raise ParseError("bad token", type, value, context) 1735e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis return ilabel 1745e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 1755e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis def shift(self, type, value, newstate, context): 1765e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """Shift a token. (Internal)""" 1775e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis dfa, state, node = self.stack[-1] 1785e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis newnode = (type, value, context, None) 1795e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis newnode = self.convert(self.grammar, newnode) 1805e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if newnode is not None: 1815e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis node[-1].append(newnode) 1825e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.stack[-1] = (dfa, newstate, node) 1835e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 1845e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis def push(self, type, newdfa, newstate, context): 1855e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """Push a nonterminal. (Internal)""" 1865e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis dfa, state, node = self.stack[-1] 1875e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis newnode = (type, None, context, []) 1885e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.stack[-1] = (dfa, newstate, node) 1895e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.stack.append((newdfa, 0, newnode)) 1905e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis 1915e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis def pop(self): 1925e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis """Pop a nonterminal. (Internal)""" 1935e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis popdfa, popstate, popnode = self.stack.pop() 1945e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis newnode = self.convert(self.grammar, popnode) 1955e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if newnode is not None: 1965e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis if self.stack: 1975e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis dfa, state, node = self.stack[-1] 1985e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis node[-1].append(newnode) 1995e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis else: 2005e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.rootnode = newnode 2015e37baea8007cb64b65a180e4d6c80de292a8a4aMartin v. Löwis self.rootnode.used_names = self.used_names 202