1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 2# Licensed to PSF under a Contributor Agreement. 3 4"""This module defines the data structures used to represent a grammar. 5 6These are a bit arcane because they are derived from the data 7structures used by Python's 'pgen' parser generator. 8 9There's also a table here mapping operators to their names in the 10token module; the Python tokenize module reports all operators as the 11fallback token code OP, but the parser needs the actual token code. 12 13""" 14 15# Python imports 16import pickle 17 18# Local imports 19from . import token, tokenize 20 21 22class Grammar(object): 23 """Pgen parsing tables conversion class. 24 25 Once initialized, this class supplies the grammar tables for the 26 parsing engine implemented by parse.py. The parsing engine 27 accesses the instance variables directly. The class here does not 28 provide initialization of the tables; several subclasses exist to 29 do this (see the conv and pgen modules). 30 31 The load() method reads the tables from a pickle file, which is 32 much faster than the other ways offered by subclasses. The pickle 33 file is written by calling dump() (after loading the grammar 34 tables using a subclass). The report() method prints a readable 35 representation of the tables to stdout, for debugging. 36 37 The instance variables are as follows: 38 39 symbol2number -- a dict mapping symbol names to numbers. Symbol 40 numbers are always 256 or higher, to distinguish 41 them from token numbers, which are between 0 and 42 255 (inclusive). 43 44 number2symbol -- a dict mapping numbers to symbol names; 45 these two are each other's inverse. 46 47 states -- a list of DFAs, where each DFA is a list of 48 states, each state is a list of arcs, and each 49 arc is a (i, j) pair where i is a label and j is 50 a state number. The DFA number is the index into 51 this list. (This name is slightly confusing.) 52 Final states are represented by a special arc of 53 the form (0, j) where j is its own state number. 54 55 dfas -- a dict mapping symbol numbers to (DFA, first) 56 pairs, where DFA is an item from the states list 57 above, and first is a set of tokens that can 58 begin this grammar rule (represented by a dict 59 whose values are always 1). 60 61 labels -- a list of (x, y) pairs where x is either a token 62 number or a symbol number, and y is either None 63 or a string; the strings are keywords. The label 64 number is the index in this list; label numbers 65 are used to mark state transitions (arcs) in the 66 DFAs. 67 68 start -- the number of the grammar's start symbol. 69 70 keywords -- a dict mapping keyword strings to arc labels. 71 72 tokens -- a dict mapping token numbers to arc labels. 73 74 """ 75 76 def __init__(self): 77 self.symbol2number = {} 78 self.number2symbol = {} 79 self.states = [] 80 self.dfas = {} 81 self.labels = [(0, "EMPTY")] 82 self.keywords = {} 83 self.tokens = {} 84 self.symbol2label = {} 85 self.start = 256 86 87 def dump(self, filename): 88 """Dump the grammar tables to a pickle file.""" 89 f = open(filename, "wb") 90 pickle.dump(self.__dict__, f, 2) 91 f.close() 92 93 def load(self, filename): 94 """Load the grammar tables from a pickle file.""" 95 f = open(filename, "rb") 96 d = pickle.load(f) 97 f.close() 98 self.__dict__.update(d) 99 100 def copy(self): 101 """ 102 Copy the grammar. 103 """ 104 new = self.__class__() 105 for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords", 106 "tokens", "symbol2label"): 107 setattr(new, dict_attr, getattr(self, dict_attr).copy()) 108 new.labels = self.labels[:] 109 new.states = self.states[:] 110 new.start = self.start 111 return new 112 113 def report(self): 114 """Dump the grammar tables to standard output, for debugging.""" 115 from pprint import pprint 116 print "s2n" 117 pprint(self.symbol2number) 118 print "n2s" 119 pprint(self.number2symbol) 120 print "states" 121 pprint(self.states) 122 print "dfas" 123 pprint(self.dfas) 124 print "labels" 125 pprint(self.labels) 126 print "start", self.start 127 128 129# Map from operator to number (since tokenize doesn't do this) 130 131opmap_raw = """ 132( LPAR 133) RPAR 134[ LSQB 135] RSQB 136: COLON 137, COMMA 138; SEMI 139+ PLUS 140- MINUS 141* STAR 142/ SLASH 143| VBAR 144& AMPER 145< LESS 146> GREATER 147= EQUAL 148. DOT 149% PERCENT 150` BACKQUOTE 151{ LBRACE 152} RBRACE 153@ AT 154== EQEQUAL 155!= NOTEQUAL 156<> NOTEQUAL 157<= LESSEQUAL 158>= GREATEREQUAL 159~ TILDE 160^ CIRCUMFLEX 161<< LEFTSHIFT 162>> RIGHTSHIFT 163** DOUBLESTAR 164+= PLUSEQUAL 165-= MINEQUAL 166*= STAREQUAL 167/= SLASHEQUAL 168%= PERCENTEQUAL 169&= AMPEREQUAL 170|= VBAREQUAL 171^= CIRCUMFLEXEQUAL 172<<= LEFTSHIFTEQUAL 173>>= RIGHTSHIFTEQUAL 174**= DOUBLESTAREQUAL 175// DOUBLESLASH 176//= DOUBLESLASHEQUAL 177-> RARROW 178""" 179 180opmap = {} 181for line in opmap_raw.splitlines(): 182 if line: 183 op, name = line.split() 184 opmap[op] = getattr(token, name) 185