1#=======================================================================
2#
3#     Python Lexical Analyser
4#
5#     Regular Expressions
6#
7#=======================================================================
8
9import types
10from sys import maxint as maxint
11
12import Errors
13
14#
15#     Constants
16#
17
18BOL = 'bol'
19EOL = 'eol'
20EOF = 'eof'
21
22nl_code = ord('\n')
23
24#
25#     Helper functions
26#
27
28def chars_to_ranges(s):
29    """
30    Return a list of character codes consisting of pairs
31    [code1a, code1b, code2a, code2b,...] which cover all
32    the characters in |s|.
33    """
34    char_list = list(s)
35    char_list.sort()
36    i = 0
37    n = len(char_list)
38    result = []
39    while i < n:
40        code1 = ord(char_list[i])
41        code2 = code1 + 1
42        i = i + 1
43        while i < n and code2 >= ord(char_list[i]):
44            code2 = code2 + 1
45            i = i + 1
46        result.append(code1)
47        result.append(code2)
48    return result
49
50def uppercase_range(code1, code2):
51    """
52    If the range of characters from code1 to code2-1 includes any
53    lower case letters, return the corresponding upper case range.
54    """
55    code3 = max(code1, ord('a'))
56    code4 = min(code2, ord('z') + 1)
57    if code3 < code4:
58        d = ord('A') - ord('a')
59        return (code3 + d, code4 + d)
60    else:
61        return None
62
63def lowercase_range(code1, code2):
64    """
65    If the range of characters from code1 to code2-1 includes any
66    upper case letters, return the corresponding lower case range.
67    """
68    code3 = max(code1, ord('A'))
69    code4 = min(code2, ord('Z') + 1)
70    if code3 < code4:
71        d = ord('a') - ord('A')
72        return (code3 + d, code4 + d)
73    else:
74        return None
75
76def CodeRanges(code_list):
77    """
78    Given a list of codes as returned by chars_to_ranges, return
79    an RE which will match a character in any of the ranges.
80    """
81    re_list = []
82    for i in xrange(0, len(code_list), 2):
83        re_list.append(CodeRange(code_list[i], code_list[i + 1]))
84    return Alt(*re_list)
85
86def CodeRange(code1, code2):
87    """
88    CodeRange(code1, code2) is an RE which matches any character
89    with a code |c| in the range |code1| <= |c| < |code2|.
90    """
91    if code1 <= nl_code < code2:
92        return Alt(RawCodeRange(code1, nl_code),
93                             RawNewline,
94                             RawCodeRange(nl_code + 1, code2))
95    else:
96        return RawCodeRange(code1, code2)
97
98#
99#     Abstract classes
100#
101
102class RE(object):
103    """RE is the base class for regular expression constructors.
104    The following operators are defined on REs:
105
106         re1 + re2         is an RE which matches |re1| followed by |re2|
107         re1 | re2         is an RE which matches either |re1| or |re2|
108    """
109
110    nullable = 1 # True if this RE can match 0 input symbols
111    match_nl = 1 # True if this RE can match a string ending with '\n'
112    str = None     # Set to a string to override the class's __str__ result
113
114    def build_machine(self, machine, initial_state, final_state,
115                                        match_bol, nocase):
116        """
117        This method should add states to |machine| to implement this
118        RE, starting at |initial_state| and ending at |final_state|.
119        If |match_bol| is true, the RE must be able to match at the
120        beginning of a line. If nocase is true, upper and lower case
121        letters should be treated as equivalent.
122        """
123        raise NotImplementedError("%s.build_machine not implemented" %
124            self.__class__.__name__)
125
126    def build_opt(self, m, initial_state, c):
127        """
128        Given a state |s| of machine |m|, return a new state
129        reachable from |s| on character |c| or epsilon.
130        """
131        s = m.new_state()
132        initial_state.link_to(s)
133        initial_state.add_transition(c, s)
134        return s
135
136    def __add__(self, other):
137        return Seq(self, other)
138
139    def __or__(self, other):
140        return Alt(self, other)
141
142    def __str__(self):
143        if self.str:
144            return self.str
145        else:
146            return self.calc_str()
147
148    def check_re(self, num, value):
149        if not isinstance(value, RE):
150            self.wrong_type(num, value, "Plex.RE instance")
151
152    def check_string(self, num, value):
153        if type(value) != type(''):
154            self.wrong_type(num, value, "string")
155
156    def check_char(self, num, value):
157        self.check_string(num, value)
158        if len(value) != 1:
159            raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s."
160                "Expected a string of length 1, got: %s" % (
161                    num, self.__class__.__name__, repr(value)))
162
163    def wrong_type(self, num, value, expected):
164        if type(value) == types.InstanceType:
165                got = "%s.%s instance" % (
166                    value.__class__.__module__, value.__class__.__name__)
167        else:
168            got = type(value).__name__
169        raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "
170                                        "(expected %s, got %s" % (
171                                            num, self.__class__.__name__, expected, got))
172
173#
174#     Primitive RE constructors
175#     -------------------------
176#
177#     These are the basic REs from which all others are built.
178#
179
180## class Char(RE):
181##     """
182##     Char(c) is an RE which matches the character |c|.
183##     """
184
185##     nullable = 0
186
187##     def __init__(self, char):
188##         self.char = char
189##         self.match_nl = char == '\n'
190
191##     def build_machine(self, m, initial_state, final_state, match_bol, nocase):
192##         c = self.char
193##         if match_bol and c != BOL:
194##             s1 = self.build_opt(m, initial_state, BOL)
195##         else:
196##             s1 = initial_state
197##         if c == '\n' or c == EOF:
198##             s1 = self.build_opt(m, s1, EOL)
199##         if len(c) == 1:
200##             code = ord(self.char)
201##             s1.add_transition((code, code+1), final_state)
202##             if nocase and is_letter_code(code):
203##                 code2 = other_case_code(code)
204##                 s1.add_transition((code2, code2+1), final_state)
205##         else:
206##             s1.add_transition(c, final_state)
207
208##     def calc_str(self):
209##         return "Char(%s)" % repr(self.char)
210
211def Char(c):
212    """
213    Char(c) is an RE which matches the character |c|.
214    """
215    if len(c) == 1:
216        result = CodeRange(ord(c), ord(c) + 1)
217    else:
218        result = SpecialSymbol(c)
219    result.str = "Char(%s)" % repr(c)
220    return result
221
222class RawCodeRange(RE):
223    """
224    RawCodeRange(code1, code2) is a low-level RE which matches any character
225    with a code |c| in the range |code1| <= |c| < |code2|, where the range
226    does not include newline. For internal use only.
227    """
228    nullable = 0
229    match_nl = 0
230    range = None                     # (code, code)
231    uppercase_range = None # (code, code) or None
232    lowercase_range = None # (code, code) or None
233
234    def __init__(self, code1, code2):
235        self.range = (code1, code2)
236        self.uppercase_range = uppercase_range(code1, code2)
237        self.lowercase_range = lowercase_range(code1, code2)
238
239    def build_machine(self, m, initial_state, final_state, match_bol, nocase):
240        if match_bol:
241            initial_state = self.build_opt(m, initial_state, BOL)
242        initial_state.add_transition(self.range, final_state)
243        if nocase:
244            if self.uppercase_range:
245                initial_state.add_transition(self.uppercase_range, final_state)
246            if self.lowercase_range:
247                initial_state.add_transition(self.lowercase_range, final_state)
248
249    def calc_str(self):
250        return "CodeRange(%d,%d)" % (self.code1, self.code2)
251
252class _RawNewline(RE):
253    """
254    RawNewline is a low-level RE which matches a newline character.
255    For internal use only.
256    """
257    nullable = 0
258    match_nl = 1
259
260    def build_machine(self, m, initial_state, final_state, match_bol, nocase):
261        if match_bol:
262            initial_state = self.build_opt(m, initial_state, BOL)
263        s = self.build_opt(m, initial_state, EOL)
264        s.add_transition((nl_code, nl_code + 1), final_state)
265
266RawNewline = _RawNewline()
267
268
269class SpecialSymbol(RE):
270    """
271    SpecialSymbol(sym) is an RE which matches the special input
272    symbol |sym|, which is one of BOL, EOL or EOF.
273    """
274    nullable = 0
275    match_nl = 0
276    sym = None
277
278    def __init__(self, sym):
279        self.sym = sym
280
281    def build_machine(self, m, initial_state, final_state, match_bol, nocase):
282        # Sequences 'bol bol' and 'bol eof' are impossible, so only need
283        # to allow for bol if sym is eol
284        if match_bol and self.sym == EOL:
285            initial_state = self.build_opt(m, initial_state, BOL)
286        initial_state.add_transition(self.sym, final_state)
287
288
289class Seq(RE):
290    """Seq(re1, re2, re3...) is an RE which matches |re1| followed by
291    |re2| followed by |re3|..."""
292
293    def __init__(self, *re_list):
294        nullable = 1
295        for i in xrange(len(re_list)):
296            re = re_list[i]
297            self.check_re(i, re)
298            nullable = nullable and re.nullable
299        self.re_list = re_list
300        self.nullable = nullable
301        i = len(re_list)
302        match_nl = 0
303        while i:
304            i = i - 1
305            re = re_list[i]
306            if re.match_nl:
307                match_nl = 1
308                break
309            if not re.nullable:
310                break
311        self.match_nl = match_nl
312
313    def build_machine(self, m, initial_state, final_state, match_bol, nocase):
314        re_list = self.re_list
315        if len(re_list) == 0:
316            initial_state.link_to(final_state)
317        else:
318            s1 = initial_state
319            n = len(re_list)
320            for i in xrange(n):
321                if i < n - 1:
322                    s2 = m.new_state()
323                else:
324                    s2 = final_state
325                re = re_list[i]
326                re.build_machine(m, s1, s2, match_bol, nocase)
327                s1 = s2
328                match_bol = re.match_nl or (match_bol and re.nullable)
329
330    def calc_str(self):
331        return "Seq(%s)" % ','.join(map(str, self.re_list))
332
333
334class Alt(RE):
335    """Alt(re1, re2, re3...) is an RE which matches either |re1| or
336    |re2| or |re3|..."""
337
338    def __init__(self, *re_list):
339        self.re_list = re_list
340        nullable = 0
341        match_nl = 0
342        nullable_res = []
343        non_nullable_res = []
344        i = 1
345        for re in re_list:
346            self.check_re(i, re)
347            if re.nullable:
348                nullable_res.append(re)
349                nullable = 1
350            else:
351                non_nullable_res.append(re)
352            if re.match_nl:
353                match_nl = 1
354            i = i + 1
355        self.nullable_res = nullable_res
356        self.non_nullable_res = non_nullable_res
357        self.nullable = nullable
358        self.match_nl = match_nl
359
360    def build_machine(self, m, initial_state, final_state, match_bol, nocase):
361        for re in self.nullable_res:
362            re.build_machine(m, initial_state, final_state, match_bol, nocase)
363        if self.non_nullable_res:
364            if match_bol:
365                initial_state = self.build_opt(m, initial_state, BOL)
366            for re in self.non_nullable_res:
367                re.build_machine(m, initial_state, final_state, 0, nocase)
368
369    def calc_str(self):
370        return "Alt(%s)" % ','.join(map(str, self.re_list))
371
372
373class Rep1(RE):
374    """Rep1(re) is an RE which matches one or more repetitions of |re|."""
375
376    def __init__(self, re):
377        self.check_re(1, re)
378        self.re = re
379        self.nullable = re.nullable
380        self.match_nl = re.match_nl
381
382    def build_machine(self, m, initial_state, final_state, match_bol, nocase):
383        s1 = m.new_state()
384        s2 = m.new_state()
385        initial_state.link_to(s1)
386        self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase)
387        s2.link_to(s1)
388        s2.link_to(final_state)
389
390    def calc_str(self):
391        return "Rep1(%s)" % self.re
392
393
394class SwitchCase(RE):
395    """
396    SwitchCase(re, nocase) is an RE which matches the same strings as RE,
397    but treating upper and lower case letters according to |nocase|. If
398    |nocase| is true, case is ignored, otherwise it is not.
399    """
400    re = None
401    nocase = None
402
403    def __init__(self, re, nocase):
404        self.re = re
405        self.nocase = nocase
406        self.nullable = re.nullable
407        self.match_nl = re.match_nl
408
409    def build_machine(self, m, initial_state, final_state, match_bol, nocase):
410        self.re.build_machine(m, initial_state, final_state, match_bol,
411                                                    self.nocase)
412
413    def calc_str(self):
414        if self.nocase:
415            name = "NoCase"
416        else:
417            name = "Case"
418        return "%s(%s)" % (name, self.re)
419
420#
421#     Composite RE constructors
422#     -------------------------
423#
424#     These REs are defined in terms of the primitive REs.
425#
426
427Empty = Seq()
428Empty.__doc__ = \
429    """
430    Empty is an RE which matches the empty string.
431    """
432Empty.str = "Empty"
433
434def Str1(s):
435    """
436    Str1(s) is an RE which matches the literal string |s|.
437    """
438    result = Seq(*tuple(map(Char, s)))
439    result.str = "Str(%s)" % repr(s)
440    return result
441
442def Str(*strs):
443    """
444    Str(s) is an RE which matches the literal string |s|.
445    Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|...
446    """
447    if len(strs) == 1:
448        return Str1(strs[0])
449    else:
450        result = Alt(*tuple(map(Str1, strs)))
451        result.str = "Str(%s)" % ','.join(map(repr, strs))
452        return result
453
454def Any(s):
455    """
456    Any(s) is an RE which matches any character in the string |s|.
457    """
458    #result = apply(Alt, tuple(map(Char, s)))
459    result = CodeRanges(chars_to_ranges(s))
460    result.str = "Any(%s)" % repr(s)
461    return result
462
463def AnyBut(s):
464    """
465    AnyBut(s) is an RE which matches any character (including
466    newline) which is not in the string |s|.
467    """
468    ranges = chars_to_ranges(s)
469    ranges.insert(0, -maxint)
470    ranges.append(maxint)
471    result = CodeRanges(ranges)
472    result.str = "AnyBut(%s)" % repr(s)
473    return result
474
475AnyChar = AnyBut("")
476AnyChar.__doc__ = \
477    """
478    AnyChar is an RE which matches any single character (including a newline).
479    """
480AnyChar.str = "AnyChar"
481
482def Range(s1, s2 = None):
483    """
484    Range(c1, c2) is an RE which matches any single character in the range
485    |c1| to |c2| inclusive.
486    Range(s) where |s| is a string of even length is an RE which matches
487    any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,...
488    """
489    if s2:
490        result = CodeRange(ord(s1), ord(s2) + 1)
491        result.str = "Range(%s,%s)" % (s1, s2)
492    else:
493        ranges = []
494        for i in range(0, len(s1), 2):
495            ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1))
496        result = Alt(*ranges)
497        result.str = "Range(%s)" % repr(s1)
498    return result
499
500def Opt(re):
501    """
502    Opt(re) is an RE which matches either |re| or the empty string.
503    """
504    result = Alt(re, Empty)
505    result.str = "Opt(%s)" % re
506    return result
507
508def Rep(re):
509    """
510    Rep(re) is an RE which matches zero or more repetitions of |re|.
511    """
512    result = Opt(Rep1(re))
513    result.str = "Rep(%s)" % re
514    return result
515
516def NoCase(re):
517    """
518    NoCase(re) is an RE which matches the same strings as RE, but treating
519    upper and lower case letters as equivalent.
520    """
521    return SwitchCase(re, nocase = 1)
522
523def Case(re):
524    """
525    Case(re) is an RE which matches the same strings as RE, but treating
526    upper and lower case letters as distinct, i.e. it cancels the effect
527    of any enclosing NoCase().
528    """
529    return SwitchCase(re, nocase = 0)
530
531#
532#     RE Constants
533#
534
535Bol = Char(BOL)
536Bol.__doc__ = \
537    """
538    Bol is an RE which matches the beginning of a line.
539    """
540Bol.str = "Bol"
541
542Eol = Char(EOL)
543Eol.__doc__ = \
544    """
545    Eol is an RE which matches the end of a line.
546    """
547Eol.str = "Eol"
548
549Eof = Char(EOF)
550Eof.__doc__ = \
551    """
552    Eof is an RE which matches the end of the file.
553    """
554Eof.str = "Eof"
555
556