1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""Convert graminit.[ch] spit out by pgen to Python code.
5
6Pgen is the Python parser generator.  It is useful to quickly create a
7parser from a grammar file in Python's grammar notation.  But I don't
8want my parsers to be written in C (yet), so I'm translating the
9parsing tables to Python data structures and writing a Python parse
10engine.
11
12Note that the token numbers are constants determined by the standard
13Python tokenizer.  The standard token module defines these numbers and
14their names (the names are not used much).  The token numbers are
15hardcoded into the Python tokenizer and into pgen.  A Python
16implementation of the Python tokenizer is also available, in the
17standard tokenize module.
18
19On the other hand, symbol numbers (representing the grammar's
20non-terminals) are assigned by pgen based on the actual grammar
21input.
22
23Note: this module is pretty much obsolete; the pgen module generates
24equivalent grammar tables directly from the Grammar.txt input file
25without having to invoke the Python pgen C program.
26
27"""
28
29# Python imports
30import re
31
32# Local imports
33from pgen2 import grammar, token
34
35
36class Converter(grammar.Grammar):
37    """Grammar subclass that reads classic pgen output files.
38
39    The run() method reads the tables as produced by the pgen parser
40    generator, typically contained in two C files, graminit.h and
41    graminit.c.  The other methods are for internal use only.
42
43    See the base class for more documentation.
44
45    """
46
47    def run(self, graminit_h, graminit_c):
48        """Load the grammar tables from the text files written by pgen."""
49        self.parse_graminit_h(graminit_h)
50        self.parse_graminit_c(graminit_c)
51        self.finish_off()
52
53    def parse_graminit_h(self, filename):
54        """Parse the .h file written by pgen.  (Internal)
55
56        This file is a sequence of #define statements defining the
57        nonterminals of the grammar as numbers.  We build two tables
58        mapping the numbers to names and back.
59
60        """
61        try:
62            f = open(filename)
63        except IOError, err:
64            print "Can't open %s: %s" % (filename, err)
65            return False
66        self.symbol2number = {}
67        self.number2symbol = {}
68        lineno = 0
69        for line in f:
70            lineno += 1
71            mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
72            if not mo and line.strip():
73                print "%s(%s): can't parse %s" % (filename, lineno,
74                                                  line.strip())
75            else:
76                symbol, number = mo.groups()
77                number = int(number)
78                assert symbol not in self.symbol2number
79                assert number not in self.number2symbol
80                self.symbol2number[symbol] = number
81                self.number2symbol[number] = symbol
82        return True
83
84    def parse_graminit_c(self, filename):
85        """Parse the .c file written by pgen.  (Internal)
86
87        The file looks as follows.  The first two lines are always this:
88
89        #include "pgenheaders.h"
90        #include "grammar.h"
91
92        After that come four blocks:
93
94        1) one or more state definitions
95        2) a table defining dfas
96        3) a table defining labels
97        4) a struct defining the grammar
98
99        A state definition has the following form:
100        - one or more arc arrays, each of the form:
101          static arc arcs_<n>_<m>[<k>] = {
102                  {<i>, <j>},
103                  ...
104          };
105        - followed by a state array, of the form:
106          static state states_<s>[<t>] = {
107                  {<k>, arcs_<n>_<m>},
108                  ...
109          };
110
111        """
112        try:
113            f = open(filename)
114        except IOError, err:
115            print "Can't open %s: %s" % (filename, err)
116            return False
117        # The code below essentially uses f's iterator-ness!
118        lineno = 0
119
120        # Expect the two #include lines
121        lineno, line = lineno+1, f.next()
122        assert line == '#include "pgenheaders.h"\n', (lineno, line)
123        lineno, line = lineno+1, f.next()
124        assert line == '#include "grammar.h"\n', (lineno, line)
125
126        # Parse the state definitions
127        lineno, line = lineno+1, f.next()
128        allarcs = {}
129        states = []
130        while line.startswith("static arc "):
131            while line.startswith("static arc "):
132                mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
133                              line)
134                assert mo, (lineno, line)
135                n, m, k = map(int, mo.groups())
136                arcs = []
137                for _ in range(k):
138                    lineno, line = lineno+1, f.next()
139                    mo = re.match(r"\s+{(\d+), (\d+)},$", line)
140                    assert mo, (lineno, line)
141                    i, j = map(int, mo.groups())
142                    arcs.append((i, j))
143                lineno, line = lineno+1, f.next()
144                assert line == "};\n", (lineno, line)
145                allarcs[(n, m)] = arcs
146                lineno, line = lineno+1, f.next()
147            mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
148            assert mo, (lineno, line)
149            s, t = map(int, mo.groups())
150            assert s == len(states), (lineno, line)
151            state = []
152            for _ in range(t):
153                lineno, line = lineno+1, f.next()
154                mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
155                assert mo, (lineno, line)
156                k, n, m = map(int, mo.groups())
157                arcs = allarcs[n, m]
158                assert k == len(arcs), (lineno, line)
159                state.append(arcs)
160            states.append(state)
161            lineno, line = lineno+1, f.next()
162            assert line == "};\n", (lineno, line)
163            lineno, line = lineno+1, f.next()
164        self.states = states
165
166        # Parse the dfas
167        dfas = {}
168        mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
169        assert mo, (lineno, line)
170        ndfas = int(mo.group(1))
171        for i in range(ndfas):
172            lineno, line = lineno+1, f.next()
173            mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
174                          line)
175            assert mo, (lineno, line)
176            symbol = mo.group(2)
177            number, x, y, z = map(int, mo.group(1, 3, 4, 5))
178            assert self.symbol2number[symbol] == number, (lineno, line)
179            assert self.number2symbol[number] == symbol, (lineno, line)
180            assert x == 0, (lineno, line)
181            state = states[z]
182            assert y == len(state), (lineno, line)
183            lineno, line = lineno+1, f.next()
184            mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
185            assert mo, (lineno, line)
186            first = {}
187            rawbitset = eval(mo.group(1))
188            for i, c in enumerate(rawbitset):
189                byte = ord(c)
190                for j in range(8):
191                    if byte & (1<<j):
192                        first[i*8 + j] = 1
193            dfas[number] = (state, first)
194        lineno, line = lineno+1, f.next()
195        assert line == "};\n", (lineno, line)
196        self.dfas = dfas
197
198        # Parse the labels
199        labels = []
200        lineno, line = lineno+1, f.next()
201        mo = re.match(r"static label labels\[(\d+)\] = {$", line)
202        assert mo, (lineno, line)
203        nlabels = int(mo.group(1))
204        for i in range(nlabels):
205            lineno, line = lineno+1, f.next()
206            mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
207            assert mo, (lineno, line)
208            x, y = mo.groups()
209            x = int(x)
210            if y == "0":
211                y = None
212            else:
213                y = eval(y)
214            labels.append((x, y))
215        lineno, line = lineno+1, f.next()
216        assert line == "};\n", (lineno, line)
217        self.labels = labels
218
219        # Parse the grammar struct
220        lineno, line = lineno+1, f.next()
221        assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
222        lineno, line = lineno+1, f.next()
223        mo = re.match(r"\s+(\d+),$", line)
224        assert mo, (lineno, line)
225        ndfas = int(mo.group(1))
226        assert ndfas == len(self.dfas)
227        lineno, line = lineno+1, f.next()
228        assert line == "\tdfas,\n", (lineno, line)
229        lineno, line = lineno+1, f.next()
230        mo = re.match(r"\s+{(\d+), labels},$", line)
231        assert mo, (lineno, line)
232        nlabels = int(mo.group(1))
233        assert nlabels == len(self.labels), (lineno, line)
234        lineno, line = lineno+1, f.next()
235        mo = re.match(r"\s+(\d+)$", line)
236        assert mo, (lineno, line)
237        start = int(mo.group(1))
238        assert start in self.number2symbol, (lineno, line)
239        self.start = start
240        lineno, line = lineno+1, f.next()
241        assert line == "};\n", (lineno, line)
242        try:
243            lineno, line = lineno+1, f.next()
244        except StopIteration:
245            pass
246        else:
247            assert 0, (lineno, line)
248
249    def finish_off(self):
250        """Create additional useful structures.  (Internal)."""
251        self.keywords = {} # map from keyword strings to arc labels
252        self.tokens = {}   # map from numeric token values to arc labels
253        for ilabel, (type, value) in enumerate(self.labels):
254            if type == token.NAME and value is not None:
255                self.keywords[value] = ilabel
256            elif value is None:
257                self.tokens[type] = ilabel
258