1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4# Pgen imports
5from . import grammar, token, tokenize
6
7class PgenGrammar(grammar.Grammar):
8    pass
9
10class ParserGenerator(object):
11
12    def __init__(self, filename, stream=None):
13        close_stream = None
14        if stream is None:
15            stream = open(filename)
16            close_stream = stream.close
17        self.filename = filename
18        self.stream = stream
19        self.generator = tokenize.generate_tokens(stream.readline)
20        self.gettoken() # Initialize lookahead
21        self.dfas, self.startsymbol = self.parse()
22        if close_stream is not None:
23            close_stream()
24        self.first = {} # map from symbol name to set of tokens
25        self.addfirstsets()
26
27    def make_grammar(self):
28        c = PgenGrammar()
29        names = self.dfas.keys()
30        names.sort()
31        names.remove(self.startsymbol)
32        names.insert(0, self.startsymbol)
33        for name in names:
34            i = 256 + len(c.symbol2number)
35            c.symbol2number[name] = i
36            c.number2symbol[i] = name
37        for name in names:
38            dfa = self.dfas[name]
39            states = []
40            for state in dfa:
41                arcs = []
42                for label, next in sorted(state.arcs.iteritems()):
43                    arcs.append((self.make_label(c, label), dfa.index(next)))
44                if state.isfinal:
45                    arcs.append((0, dfa.index(state)))
46                states.append(arcs)
47            c.states.append(states)
48            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
49        c.start = c.symbol2number[self.startsymbol]
50        return c
51
52    def make_first(self, c, name):
53        rawfirst = self.first[name]
54        first = {}
55        for label in sorted(rawfirst):
56            ilabel = self.make_label(c, label)
57            ##assert ilabel not in first # XXX failed on <> ... !=
58            first[ilabel] = 1
59        return first
60
61    def make_label(self, c, label):
62        # XXX Maybe this should be a method on a subclass of converter?
63        ilabel = len(c.labels)
64        if label[0].isalpha():
65            # Either a symbol name or a named token
66            if label in c.symbol2number:
67                # A symbol name (a non-terminal)
68                if label in c.symbol2label:
69                    return c.symbol2label[label]
70                else:
71                    c.labels.append((c.symbol2number[label], None))
72                    c.symbol2label[label] = ilabel
73                    return ilabel
74            else:
75                # A named token (NAME, NUMBER, STRING)
76                itoken = getattr(token, label, None)
77                assert isinstance(itoken, int), label
78                assert itoken in token.tok_name, label
79                if itoken in c.tokens:
80                    return c.tokens[itoken]
81                else:
82                    c.labels.append((itoken, None))
83                    c.tokens[itoken] = ilabel
84                    return ilabel
85        else:
86            # Either a keyword or an operator
87            assert label[0] in ('"', "'"), label
88            value = eval(label)
89            if value[0].isalpha():
90                # A keyword
91                if value in c.keywords:
92                    return c.keywords[value]
93                else:
94                    c.labels.append((token.NAME, value))
95                    c.keywords[value] = ilabel
96                    return ilabel
97            else:
98                # An operator (any non-numeric token)
99                itoken = grammar.opmap[value] # Fails if unknown token
100                if itoken in c.tokens:
101                    return c.tokens[itoken]
102                else:
103                    c.labels.append((itoken, None))
104                    c.tokens[itoken] = ilabel
105                    return ilabel
106
107    def addfirstsets(self):
108        names = self.dfas.keys()
109        names.sort()
110        for name in names:
111            if name not in self.first:
112                self.calcfirst(name)
113            #print name, self.first[name].keys()
114
115    def calcfirst(self, name):
116        dfa = self.dfas[name]
117        self.first[name] = None # dummy to detect left recursion
118        state = dfa[0]
119        totalset = {}
120        overlapcheck = {}
121        for label, next in state.arcs.iteritems():
122            if label in self.dfas:
123                if label in self.first:
124                    fset = self.first[label]
125                    if fset is None:
126                        raise ValueError("recursion for rule %r" % name)
127                else:
128                    self.calcfirst(label)
129                    fset = self.first[label]
130                totalset.update(fset)
131                overlapcheck[label] = fset
132            else:
133                totalset[label] = 1
134                overlapcheck[label] = {label: 1}
135        inverse = {}
136        for label, itsfirst in overlapcheck.iteritems():
137            for symbol in itsfirst:
138                if symbol in inverse:
139                    raise ValueError("rule %s is ambiguous; %s is in the"
140                                     " first sets of %s as well as %s" %
141                                     (name, symbol, label, inverse[symbol]))
142                inverse[symbol] = label
143        self.first[name] = totalset
144
145    def parse(self):
146        dfas = {}
147        startsymbol = None
148        # MSTART: (NEWLINE | RULE)* ENDMARKER
149        while self.type != token.ENDMARKER:
150            while self.type == token.NEWLINE:
151                self.gettoken()
152            # RULE: NAME ':' RHS NEWLINE
153            name = self.expect(token.NAME)
154            self.expect(token.OP, ":")
155            a, z = self.parse_rhs()
156            self.expect(token.NEWLINE)
157            #self.dump_nfa(name, a, z)
158            dfa = self.make_dfa(a, z)
159            #self.dump_dfa(name, dfa)
160            oldlen = len(dfa)
161            self.simplify_dfa(dfa)
162            newlen = len(dfa)
163            dfas[name] = dfa
164            #print name, oldlen, newlen
165            if startsymbol is None:
166                startsymbol = name
167        return dfas, startsymbol
168
169    def make_dfa(self, start, finish):
170        # To turn an NFA into a DFA, we define the states of the DFA
171        # to correspond to *sets* of states of the NFA.  Then do some
172        # state reduction.  Let's represent sets as dicts with 1 for
173        # values.
174        assert isinstance(start, NFAState)
175        assert isinstance(finish, NFAState)
176        def closure(state):
177            base = {}
178            addclosure(state, base)
179            return base
180        def addclosure(state, base):
181            assert isinstance(state, NFAState)
182            if state in base:
183                return
184            base[state] = 1
185            for label, next in state.arcs:
186                if label is None:
187                    addclosure(next, base)
188        states = [DFAState(closure(start), finish)]
189        for state in states: # NB states grows while we're iterating
190            arcs = {}
191            for nfastate in state.nfaset:
192                for label, next in nfastate.arcs:
193                    if label is not None:
194                        addclosure(next, arcs.setdefault(label, {}))
195            for label, nfaset in sorted(arcs.iteritems()):
196                for st in states:
197                    if st.nfaset == nfaset:
198                        break
199                else:
200                    st = DFAState(nfaset, finish)
201                    states.append(st)
202                state.addarc(st, label)
203        return states # List of DFAState instances; first one is start
204
205    def dump_nfa(self, name, start, finish):
206        print "Dump of NFA for", name
207        todo = [start]
208        for i, state in enumerate(todo):
209            print "  State", i, state is finish and "(final)" or ""
210            for label, next in state.arcs:
211                if next in todo:
212                    j = todo.index(next)
213                else:
214                    j = len(todo)
215                    todo.append(next)
216                if label is None:
217                    print "    -> %d" % j
218                else:
219                    print "    %s -> %d" % (label, j)
220
221    def dump_dfa(self, name, dfa):
222        print "Dump of DFA for", name
223        for i, state in enumerate(dfa):
224            print "  State", i, state.isfinal and "(final)" or ""
225            for label, next in sorted(state.arcs.iteritems()):
226                print "    %s -> %d" % (label, dfa.index(next))
227
228    def simplify_dfa(self, dfa):
229        # This is not theoretically optimal, but works well enough.
230        # Algorithm: repeatedly look for two states that have the same
231        # set of arcs (same labels pointing to the same nodes) and
232        # unify them, until things stop changing.
233
234        # dfa is a list of DFAState instances
235        changes = True
236        while changes:
237            changes = False
238            for i, state_i in enumerate(dfa):
239                for j in range(i+1, len(dfa)):
240                    state_j = dfa[j]
241                    if state_i == state_j:
242                        #print "  unify", i, j
243                        del dfa[j]
244                        for state in dfa:
245                            state.unifystate(state_j, state_i)
246                        changes = True
247                        break
248
249    def parse_rhs(self):
250        # RHS: ALT ('|' ALT)*
251        a, z = self.parse_alt()
252        if self.value != "|":
253            return a, z
254        else:
255            aa = NFAState()
256            zz = NFAState()
257            aa.addarc(a)
258            z.addarc(zz)
259            while self.value == "|":
260                self.gettoken()
261                a, z = self.parse_alt()
262                aa.addarc(a)
263                z.addarc(zz)
264            return aa, zz
265
266    def parse_alt(self):
267        # ALT: ITEM+
268        a, b = self.parse_item()
269        while (self.value in ("(", "[") or
270               self.type in (token.NAME, token.STRING)):
271            c, d = self.parse_item()
272            b.addarc(c)
273            b = d
274        return a, b
275
276    def parse_item(self):
277        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
278        if self.value == "[":
279            self.gettoken()
280            a, z = self.parse_rhs()
281            self.expect(token.OP, "]")
282            a.addarc(z)
283            return a, z
284        else:
285            a, z = self.parse_atom()
286            value = self.value
287            if value not in ("+", "*"):
288                return a, z
289            self.gettoken()
290            z.addarc(a)
291            if value == "+":
292                return a, z
293            else:
294                return a, a
295
296    def parse_atom(self):
297        # ATOM: '(' RHS ')' | NAME | STRING
298        if self.value == "(":
299            self.gettoken()
300            a, z = self.parse_rhs()
301            self.expect(token.OP, ")")
302            return a, z
303        elif self.type in (token.NAME, token.STRING):
304            a = NFAState()
305            z = NFAState()
306            a.addarc(z, self.value)
307            self.gettoken()
308            return a, z
309        else:
310            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
311                             self.type, self.value)
312
313    def expect(self, type, value=None):
314        if self.type != type or (value is not None and self.value != value):
315            self.raise_error("expected %s/%s, got %s/%s",
316                             type, value, self.type, self.value)
317        value = self.value
318        self.gettoken()
319        return value
320
321    def gettoken(self):
322        tup = self.generator.next()
323        while tup[0] in (tokenize.COMMENT, tokenize.NL):
324            tup = self.generator.next()
325        self.type, self.value, self.begin, self.end, self.line = tup
326        #print token.tok_name[self.type], repr(self.value)
327
328    def raise_error(self, msg, *args):
329        if args:
330            try:
331                msg = msg % args
332            except:
333                msg = " ".join([msg] + map(str, args))
334        raise SyntaxError(msg, (self.filename, self.end[0],
335                                self.end[1], self.line))
336
337class NFAState(object):
338
339    def __init__(self):
340        self.arcs = [] # list of (label, NFAState) pairs
341
342    def addarc(self, next, label=None):
343        assert label is None or isinstance(label, str)
344        assert isinstance(next, NFAState)
345        self.arcs.append((label, next))
346
347class DFAState(object):
348
349    def __init__(self, nfaset, final):
350        assert isinstance(nfaset, dict)
351        assert isinstance(iter(nfaset).next(), NFAState)
352        assert isinstance(final, NFAState)
353        self.nfaset = nfaset
354        self.isfinal = final in nfaset
355        self.arcs = {} # map from label to DFAState
356
357    def addarc(self, next, label):
358        assert isinstance(label, str)
359        assert label not in self.arcs
360        assert isinstance(next, DFAState)
361        self.arcs[label] = next
362
363    def unifystate(self, old, new):
364        for label, next in self.arcs.iteritems():
365            if next is old:
366                self.arcs[label] = new
367
368    def __eq__(self, other):
369        # Equality test -- ignore the nfaset instance variable
370        assert isinstance(other, DFAState)
371        if self.isfinal != other.isfinal:
372            return False
373        # Can't just return self.arcs == other.arcs, because that
374        # would invoke this method recursively, with cycles...
375        if len(self.arcs) != len(other.arcs):
376            return False
377        for label, next in self.arcs.iteritems():
378            if next is not other.arcs.get(label):
379                return False
380        return True
381
382    __hash__ = None # For Py3 compatibility.
383
384def generate_grammar(filename="Grammar.txt"):
385    p = ParserGenerator(filename)
386    return p.make_grammar()
387