1#=======================================================================
2#
3#   Python Lexical Analyser
4#
5#
6#   Scanning an input stream
7#
8#=======================================================================
9
10import cython
11cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object)
12
13import Errors
14from Regexps import BOL, EOL, EOF
15
16NOT_FOUND = object()
17
class Scanner(object):
  """
  A Scanner is used to read tokens from a stream of characters
  using the token set specified by a Plex.Lexicon.

  Constructor:

    Scanner(lexicon, stream, name = '', initial_pos = None)

      See the docstring of the __init__ method for details.

  Methods:

    See the docstrings of the individual methods for more
    information.

    read() --> (value, text)
      Reads the next lexical token from the stream.

    position() --> (name, line, col)
      Returns the position of the last token read using the
      read() method.

    begin(state_name)
      Causes scanner to change state.

    produce(value [, text])
      Causes return of a token value to the caller of the
      Scanner.

  """

  # Instance attributes (all are set in __init__):
  #
  #  lexicon = None        # Lexicon
  #  stream = None         # file-like object
  #  name = ''
  #  buffer = ''
  #  buf_start_pos = 0     # position in input of start of buffer
  #  next_pos = 0          # position in input of next char to read
  #  cur_pos = 0           # position in input of current char
  #  cur_line = 1          # line number of current char
  #  cur_line_start = 0    # position in input of start of current line
  #  start_pos = 0         # position in input of start of token
  #  start_line = 0        # line number of start of token
  #  start_col = 0         # position in line of start of token
  #  text = None           # text of last token read
  #  initial_state = None  # Node
  #  state_name = ''       # Name of initial state
  #  queue = None          # list of tokens to be returned
  #  trace = 0
  #  cur_char = BOL        # current input symbol: a character, BOL, EOL,
  #                        # EOF, or u'' once input is exhausted
  #  input_state = 1       # phase of the input state machine:
  #                        #   1 = reading ordinary characters
  #                        #   2 = EOL delivered for a newline, u'\n' is next
  #                        #   3 = u'\n' delivered, BOL of the next line is next
  #                        #   4 = EOL delivered at end of input, EOF is next
  #                        #   5 = EOF delivered, only u'' from now on

  def __init__(self, lexicon, stream, name = '', initial_pos = None):
    """
    Scanner(lexicon, stream, name = '', initial_pos = None)

      |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
      to be recognised.

      |stream| can be a file object or anything which implements a
      compatible read() method.

      |name| is optional, and may be the name of the file being
      scanned or any other identifying string.

      |initial_pos| is optional. When given, it is indexed as a
      sequence (presumably a (name, line, col) tuple like the one
      returned by position()): item 1 becomes the starting line number
      and item 2 the column reported for the first character read.
    """
    self.trace = 0

    self.buffer = u''
    self.buf_start_pos = 0
    self.next_pos = 0
    self.cur_pos = 0
    self.cur_line = 1
    self.start_pos = 0
    self.start_line = 0
    self.start_col = 0
    self.text = None
    self.state_name = None

    self.lexicon = lexicon
    self.stream = stream
    self.name = name
    self.queue = []
    self.initial_state = None
    self.begin('')
    self.cur_line_start = 0
    self.cur_char = BOL
    self.input_state = 1
    if initial_pos is not None:
      # Columns are computed as cur_pos - cur_line_start, so a negative
      # line start makes the first token report column initial_pos[2].
      self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]

  def read(self):
    """
    Read the next lexical token from the stream and return a
    tuple (value, text), where |value| is the value associated with
    the token as specified by the Lexicon, and |text| is the actual
    string read from the stream. Returns (None, '') on end of file.
    """
    queue = self.queue
    # Keep scanning until some action (or end of file) has queued at
    # least one token; actions returning None produce nothing.
    while not queue:
      self.text, action = self.scan_a_token()
      if action is None:
        self.produce(None)
        self.eof()
      else:
        value = action.perform(self, self.text)
        if value is not None:
          self.produce(value)
    return queue.pop(0)

  def scan_a_token(self):
    """
    Read the next input sequence recognised by the machine
    and return (text, action). Returns ('', None) on end of
    file. Raises Errors.UnrecognizedInput if the input at the
    current position matches no token.
    """
    self.start_pos = self.cur_pos
    self.start_line = self.cur_line
    self.start_col = self.cur_pos - self.cur_line_start
    action = self.run_machine_inlined()
    if action is not None:
      if self.trace:
        print("Scanner: read: Performing %s %d:%d" % (
          action, self.start_pos, self.cur_pos))
      text = self.buffer[self.start_pos - self.buf_start_pos :
                         self.cur_pos   - self.buf_start_pos]
      return (text, action)
    else:
      if self.cur_pos == self.start_pos:
        # Nothing was consumed: step past an unmatched EOL so that the
        # end-of-file marker (if that is what follows) becomes visible.
        if self.cur_char is EOL:
          self.next_char()
        if self.cur_char is None or self.cur_char is EOF:
          return (u'', None)
      raise Errors.UnrecognizedInput(self, self.state_name)

  def run_machine_inlined(self):
    """
    Inlined version of run_machine for speed.

    Runs the state machine from self.initial_state over the input,
    starting at the current position, and returns the action of the
    last accepting state reached (or None if there was none). The
    scanner's position attributes are left at the point where that
    action was recorded, or where the machine blocked if no action
    was found.
    """
    # Work on local copies of the scanner state; they are written back
    # to the instance once the machine stops.
    state = self.initial_state
    cur_pos = self.cur_pos
    cur_line = self.cur_line
    cur_line_start = self.cur_line_start
    cur_char = self.cur_char
    input_state = self.input_state
    next_pos = self.next_pos
    buffer = self.buffer
    buf_start_pos = self.buf_start_pos
    buf_len = len(buffer)
    # Snapshot of the most recent accepting configuration (b_* = backup).
    b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
              None, 0, 0, 0, u'', 0, 0
    trace = self.trace
    while 1:
      if trace: #TRACE#
        print("State %d, %d/%d:%s -->" % ( #TRACE#
          state['number'], input_state, cur_pos, repr(cur_char)))  #TRACE#
      # Begin inlined self.save_for_backup()
      #action = state.action #@slow
      action = state['action'] #@fast
      if action is not None:
        b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
                  action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
      # End inlined self.save_for_backup()
      c = cur_char
      #new_state = state.new_state(c) #@slow
      new_state = state.get(c, NOT_FOUND) #@fast
      if new_state is NOT_FOUND: #@fast
        # No explicit transition: fall back to 'else', but never for the
        # empty string that marks exhausted input.
        new_state = c and state.get('else') #@fast
      if new_state:
        if trace: #TRACE#
          print("State %d" % new_state['number'])  #TRACE#
        state = new_state
        # Begin inlined: self.next_char()
        if input_state == 1:
          cur_pos = next_pos
          # Begin inlined: c = self.read_char()
          buf_index = next_pos - buf_start_pos
          if buf_index < buf_len:
            c = buffer[buf_index]
            next_pos = next_pos + 1
          else:
            # Buffer exhausted: drop everything before the current token
            # and append the next chunk from the stream.
            discard = self.start_pos - buf_start_pos
            data = self.stream.read(0x1000)
            buffer = self.buffer[discard:] + data
            self.buffer = buffer
            buf_start_pos = buf_start_pos + discard
            self.buf_start_pos = buf_start_pos
            buf_len = len(buffer)
            buf_index = buf_index - discard
            if data:
              c = buffer[buf_index]
              next_pos = next_pos + 1
            else:
              c = u''
          # End inlined: c = self.read_char()
          if c == u'\n':
            cur_char = EOL
            input_state = 2
          elif not c:
            cur_char = EOL
            input_state = 4
          else:
            cur_char = c
        elif input_state == 2:
          cur_char = u'\n'
          input_state = 3
        elif input_state == 3:
          cur_line = cur_line + 1
          cur_line_start = cur_pos = next_pos
          cur_char = BOL
          input_state = 1
        elif input_state == 4:
          cur_char = EOF
          input_state = 5
        else: # input_state = 5
          cur_char = u''
        # End inlined self.next_char()
      else: # not new_state
        if trace: #TRACE#
          print("blocked")  #TRACE#
        # Begin inlined: action = self.back_up()
        if b_action is not None:
          (action, cur_pos, cur_line, cur_line_start,
           cur_char, input_state, next_pos) = \
                   (b_action, b_cur_pos, b_cur_line, b_cur_line_start,
                    b_cur_char, b_input_state, b_next_pos)
        else:
          action = None
        break # while 1
        # End inlined: action = self.back_up()
    self.cur_pos = cur_pos
    self.cur_line = cur_line
    self.cur_line_start = cur_line_start
    self.cur_char = cur_char
    self.input_state = input_state
    self.next_pos     = next_pos
    if trace: #TRACE#
      if action is not None: #TRACE#
        print("Doing %s" % action) #TRACE#
    return action

  def read_char(self):
    """
    Return the next character of the input, refilling the buffer
    from the stream when it is exhausted, and advance next_pos.
    Returns u'' (without advancing) at end of input.

    This is the non-inlined form of the read step embedded in
    run_machine_inlined(); next_char() relies on it.
    """
    next_pos = self.next_pos
    buf_index = next_pos - self.buf_start_pos
    if buf_index >= len(self.buffer):
      # Buffer exhausted: drop everything before the current token
      # and append the next chunk from the stream.
      discard = self.start_pos - self.buf_start_pos
      data = self.stream.read(0x1000)
      self.buffer = self.buffer[discard:] + data
      self.buf_start_pos = self.buf_start_pos + discard
      buf_index = buf_index - discard
      if not data:
        return u''
    c = self.buffer[buf_index]
    self.next_pos = next_pos + 1
    return c

  def next_char(self):
    """
    Advance the input by one symbol, updating cur_char, cur_pos,
    input_state and the line bookkeeping. A newline in the stream
    is delivered as the sequence EOL, u'\\n', BOL; end of input is
    delivered as EOL, EOF and then u'' indefinitely.
    """
    input_state = self.input_state
    if self.trace:
      print("Scanner: next: %s [%d] %d" % (" "*20, input_state, self.cur_pos))
    if input_state == 1:
      self.cur_pos = self.next_pos
      c = self.read_char()
      if c == u'\n':
        self.cur_char = EOL
        self.input_state = 2
      elif not c:
        self.cur_char = EOL
        self.input_state = 4
      else:
        self.cur_char = c
    elif input_state == 2:
      self.cur_char = u'\n'
      self.input_state = 3
    elif input_state == 3:
      self.cur_line = self.cur_line + 1
      self.cur_line_start = self.cur_pos = self.next_pos
      self.cur_char = BOL
      self.input_state = 1
    elif input_state == 4:
      self.cur_char = EOF
      self.input_state = 5
    else: # input_state = 5
      self.cur_char = u''
    if self.trace:
      print("--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)))

  def position(self):
    """
    Return a tuple (name, line, col) representing the location of
    the last token read using the read() method. |name| is the
    name that was provided to the Scanner constructor; |line|
    is the line number in the stream (1-based); |col| is the
    position within the line of the first character of the token
    (0-based).
    """
    return (self.name, self.start_line, self.start_col)

  def get_position(self):
    """Python accessible wrapper around position(), only for error reporting.
    """
    return self.position()

  def begin(self, state_name):
    """Set the current state of the scanner to the named state."""
    self.initial_state = (
      self.lexicon.get_initial_state(state_name))
    self.state_name = state_name

  def produce(self, value, text = None):
    """
    Called from an action procedure, causes |value| to be returned
    as the token value from read(). If |text| is supplied, it is
    returned in place of the scanned text.

    produce() can be called more than once during a single call to an action
    procedure, in which case the tokens are queued up and returned one
    at a time by subsequent calls to read(), until the queue is empty,
    whereupon scanning resumes.
    """
    if text is None:
      text = self.text
    self.queue.append((value, text))

  def eof(self):
    """
    Override this method if you want something to be done at
    end of file.
    """
333