1#=======================================================================
2#
3#   Python Lexical Analyser
4#
5#   Traditional Regular Expression Syntax
6#
7#=======================================================================
8
9from Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
10from Errors import PlexError
11
class RegexpSyntaxError(PlexError):
  """Signals a syntax error found while parsing a traditional regexp string."""
14
def re(s):
  """
  Build the Plex regular expression that corresponds to |s|, a regular
  expression written in traditional string syntax.

  The string is handed to an REParser; whatever its parse_re() method
  returns is returned unchanged.
  """
  parser = REParser(s)
  return parser.parse_re()
21
class REParser(object):
  """
  Recursive-descent parser that converts a traditional regular expression
  string into a Plex regexp built from the Regexps constructors.

  Grammar handled, loosest binding first:

    alt  ::= seq ('|' seq)*
    seq  ::= mod*
    mod  ::= prim ('*' | '+' | '?')*
    prim ::= '.' | '^' | '$' | '(' alt ')' | '[' charset ']'
           | backslash char | char

  Parser state:

    s   -- the string being parsed
    i   -- index of the current character within s
    c   -- the current character, or '' once the input is exhausted
    end -- 1 once the input is exhausted, otherwise 0
  """

  def __init__(self, s):
    """Start parsing |s|, positioned on its first character."""
    self.s = s
    # Start one before the beginning so the first next() lands on index 0.
    self.i = -1
    self.end = 0
    self.next()

  def parse_re(self):
    """
    Parse the whole string and return the resulting regexp.

    Raises RegexpSyntaxError if input remains after a complete
    alternation has been parsed (for example an unmatched ')').
    """
    result = self.parse_alt()
    if not self.end:
      self.error("Unexpected %s" % repr(self.c))
    return result

  def parse_alt(self):
    """
    Parse a set of alternative regexps separated by '|'.

    A single alternative is returned as-is; two or more are wrapped
    in Alt.
    """
    alternatives = [self.parse_seq()]
    while self.c == '|':
      self.next()
      alternatives.append(self.parse_seq())
    if len(alternatives) == 1:
      return alternatives[0]
    return Alt(*alternatives)

  def parse_seq(self):
    """
    Parse a sequence of regexps, stopping at end of input, '|' or ')'.

    The items are always wrapped in Seq, even when there are none.
    """
    items = []
    while not self.end and self.c not in "|)":
      items.append(self.parse_mod())
    return Seq(*items)

  def parse_mod(self):
    """Parse a primitive regexp followed by *, +, ? modifiers."""
    result = self.parse_prim()
    # Modifiers may be stacked; each one wraps the result so far.
    while not self.end and self.c in "*+?":
      if self.c == '*':
        result = Rep(result)
      elif self.c == '+':
        result = Rep1(result)
      else: # self.c == '?'
        result = Opt(result)
      self.next()
    return result

  def parse_prim(self):
    """
    Parse a primitive regexp: '.', '^', '$', a parenthesised group,
    a bracketed charset, or a single (optionally backslash-escaped)
    literal character.

    Raises RegexpSyntaxError on premature end of input or on a missing
    closing ')' or ']'.
    """
    c = self.get()
    if c == '.':
      result = AnyBut("\n")
    elif c == '^':
      result = Bol
    elif c == '$':
      result = Eol
    elif c == '(':
      result = self.parse_alt()
      self.expect(')')
    elif c == '[':
      result = self.parse_charset()
      self.expect(']')
    else:
      if c == '\\':
        # A backslash makes the following character a literal.
        c = self.get()
      result = Char(c)
    return result

  def parse_charset(self):
    """
    Parse a charset. Does not include the surrounding [].

    A leading '^' inverts the set and a ']' in first position is taken
    literally. 'a-z' denotes an inclusive range, except that a '-'
    directly before the closing ']' is a literal '-'. Returns Any(chars),
    or AnyBut(chars) for an inverted set, and leaves the parser on the
    closing ']' (or at end of input if there is none).
    """
    char_list = []
    invert = False
    if self.c == '^':
      invert = True
      self.next()
    if self.c == ']':
      char_list.append(']')
      self.next()
    while not self.end and self.c != ']':
      c1 = self.get()
      if self.c == '-' and self.lookahead(1) != ']':
        self.next()
        c2 = self.get()
        # range() exists on both Python 2 and 3, unlike xrange(). A
        # reversed range (c2 < c1) contributes no characters.
        char_list.extend(chr(a) for a in range(ord(c1), ord(c2) + 1))
      else:
        char_list.append(c1)
    chars = ''.join(char_list)
    if invert:
      return AnyBut(chars)
    return Any(chars)

  def next(self):
    """
    Advance to the next char.

    Past the end of the string, c becomes '' and end is set to 1.
    """
    s = self.s
    i = self.i = self.i + 1
    if i < len(s):
      self.c = s[i]
    else:
      self.c = ''
      self.end = 1

  def get(self):
    """
    Return the current char and advance past it.

    Raises RegexpSyntaxError if the input is already exhausted.
    """
    if self.end:
      self.error("Premature end of string")
    c = self.c
    self.next()
    return c

  def lookahead(self, n):
    """
    Look ahead n chars.

    Returns the char n positions after the current one without
    consuming anything, or '' if that position is past the end.
    """
    j = self.i + n
    if j < len(self.s):
      return self.s[j]
    return ''

  def expect(self, c):
    """
    Expect to find character |c| at current position and consume it.
    Raises an exception otherwise.
    """
    if self.c == c:
      self.next()
    else:
      self.error("Missing %s" % repr(c))

  def error(self, mess):
    """Raise exception to signal syntax error in regexp."""
    raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
      repr(self.s), self.i, mess))
152
153
154
155