#=======================================================================
#
#   Python Lexical Analyser
#
#
#   Scanning an input stream
#
#=======================================================================

import cython
cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object)

import Errors
from Regexps import BOL, EOL, EOF

# Unique sentinel for "no transition on this character" in state lookups.
NOT_FOUND = object()

class Scanner(object):
    """
    A Scanner is used to read tokens from a stream of characters
    using the token set specified by a Plex.Lexicon.

    Constructor:

      Scanner(lexicon, stream, name = '', initial_pos = None)

        See the docstring of the __init__ method for details.

    Methods:

      See the docstrings of the individual methods for more
      information.

      read() --> (value, text)
        Reads the next lexical token from the stream.

      position() --> (name, line, col)
        Returns the position of the last token read using the
        read() method.

      begin(state_name)
        Causes scanner to change state.

      produce(value [, text])
        Causes return of a token value to the caller of the
        Scanner.

    """

    # Instance attributes (all assigned in __init__):
    #
    #  lexicon = None        # Lexicon
    #  stream = None         # file-like object
    #  name = ''
    #  buffer = ''
    #  buf_start_pos = 0     # position in input of start of buffer
    #  next_pos = 0          # position in input of next char to read
    #  cur_pos = 0           # position in input of current char
    #  cur_line = 1          # line number of current char
    #  cur_line_start = 0    # position in input of start of current line
    #  start_pos = 0         # position in input of start of token
    #  start_line = 0        # line number of start of token
    #  start_col = 0         # position in line of start of token
    #  text = None           # text of last token read
    #  initial_state = None  # Node
    #  state_name = ''       # Name of initial state
    #  queue = None          # list of tokens to be returned
    #  trace = 0
    #  cur_char = BOL        # current input symbol: a character, BOL, EOL,
    #                        # EOF, or u'' once the input is exhausted
    #  input_state = 1       # 1: reading characters from the buffer
    #                        # 2: EOL delivered, u'\n' comes next
    #                        # 3: u'\n' delivered, BOL comes next
    #                        # 4: EOL delivered at end of input, EOF next
    #                        # 5: EOF delivered, only u'' from now on

    def __init__(self, lexicon, stream, name = '', initial_pos = None):
        """
        Scanner(lexicon, stream, name = '', initial_pos = None)

        |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
        to be recognised.

        |stream| can be a file object or anything which implements a
        compatible read() method.

        |name| is optional, and may be the name of the file being
        scanned or any other identifying string.

        |initial_pos| is optional. When given, it must be a sequence
        whose items [1] and [2] are the line number and column at which
        scanning is considered to start (the same layout as the tuple
        returned by position()).
        """
        self.trace = 0

        self.buffer = u''
        self.buf_start_pos = 0
        self.next_pos = 0
        self.cur_pos = 0
        self.cur_line = 1
        self.start_pos = 0
        self.start_line = 0
        self.start_col = 0
        self.text = None
        self.state_name = None

        self.lexicon = lexicon
        self.stream = stream
        self.name = name
        self.queue = []
        self.initial_state = None
        self.begin('')
        self.next_pos = 0
        self.cur_pos = 0
        self.cur_line_start = 0
        self.cur_char = BOL
        self.input_state = 1
        if initial_pos is not None:
            # A negative line start makes the computed column
            # (cur_pos - cur_line_start) begin at the requested column.
            self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]

    def read(self):
        """
        Read the next lexical token from the stream and return a
        tuple (value, text), where |value| is the value associated with
        the token as specified by the Lexicon, and |text| is the actual
        string read from the stream. Returns (None, '') on end of file.
        """
        queue = self.queue
        # Keep scanning until some action (or end of file) has queued at
        # least one token; actions returning None produce nothing.
        while not queue:
            self.text, action = self.scan_a_token()
            if action is None:
                self.produce(None)
                self.eof()
            else:
                value = action.perform(self, self.text)
                if value is not None:
                    self.produce(value)
        return queue.pop(0)

    def scan_a_token(self):
        """
        Read the next input sequence recognised by the machine
        and return (text, action). Returns ('', None) on end of
        file.

        Raises Errors.UnrecognizedInput if the machine cannot match
        anything and the input is not at end of file.
        """
        self.start_pos = self.cur_pos
        self.start_line = self.cur_line
        self.start_col = self.cur_pos - self.cur_line_start
        action = self.run_machine_inlined()
        if action is not None:
            if self.trace:
                print("Scanner: read: Performing %s %d:%d" % (
                    action, self.start_pos, self.cur_pos))
            text = self.buffer[self.start_pos - self.buf_start_pos :
                               self.cur_pos - self.buf_start_pos]
            return (text, action)
        else:
            if self.cur_pos == self.start_pos:
                # Nothing was consumed: step past a pending EOL marker so
                # that a following EOF can be detected.
                if self.cur_char is EOL:
                    self.next_char()
                if self.cur_char is None or self.cur_char is EOF:
                    return (u'', None)
            raise Errors.UnrecognizedInput(self, self.state_name)

    def run_machine_inlined(self):
        """
        Inlined version of run_machine for speed.

        Runs the state machine from self.initial_state over the input,
        remembering the most recent state that carried an action. When no
        further transition is possible, the scanner position is restored
        to that remembered point and its action is returned; None is
        returned if no state with an action was passed.

        The bodies of next_char() and read_char() are duplicated inline
        here and must be kept in step with those methods.
        """
        state = self.initial_state
        cur_pos = self.cur_pos
        cur_line = self.cur_line
        cur_line_start = self.cur_line_start
        cur_char = self.cur_char
        input_state = self.input_state
        next_pos = self.next_pos
        buffer = self.buffer
        buf_start_pos = self.buf_start_pos
        buf_len = len(buffer)
        b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
            None, 0, 0, 0, u'', 0, 0
        trace = self.trace
        while 1:
            if trace: #TRACE#
                print("State %d, %d/%d:%s -->" % ( #TRACE#
                    state['number'], input_state, cur_pos, repr(cur_char))) #TRACE#
            # Begin inlined self.save_for_backup()
            #action = state.action #@slow
            action = state['action'] #@fast
            if action is not None:
                b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
                    action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
            # End inlined self.save_for_backup()
            c = cur_char
            #new_state = state.new_state(c) #@slow
            new_state = state.get(c, NOT_FOUND) #@fast
            if new_state is NOT_FOUND: #@fast
                # No explicit transition: use the 'else' transition, but
                # only for a non-empty symbol (u'' means input exhausted).
                new_state = c and state.get('else') #@fast
            if new_state:
                if trace: #TRACE#
                    print("State %d" % new_state['number']) #TRACE#
                state = new_state
                # Begin inlined: self.next_char()
                if input_state == 1:
                    cur_pos = next_pos
                    # Begin inlined: c = self.read_char()
                    buf_index = next_pos - buf_start_pos
                    if buf_index < buf_len:
                        c = buffer[buf_index]
                        next_pos = next_pos + 1
                    else:
                        # Buffer exhausted: drop everything before the
                        # start of the current token and read more input.
                        discard = self.start_pos - buf_start_pos
                        data = self.stream.read(0x1000)
                        buffer = self.buffer[discard:] + data
                        self.buffer = buffer
                        buf_start_pos = buf_start_pos + discard
                        self.buf_start_pos = buf_start_pos
                        buf_len = len(buffer)
                        buf_index = buf_index - discard
                        if data:
                            c = buffer[buf_index]
                            next_pos = next_pos + 1
                        else:
                            c = u''
                    # End inlined: c = self.read_char()
                    if c == u'\n':
                        cur_char = EOL
                        input_state = 2
                    elif not c:
                        cur_char = EOL
                        input_state = 4
                    else:
                        cur_char = c
                elif input_state == 2:
                    cur_char = u'\n'
                    input_state = 3
                elif input_state == 3:
                    cur_line = cur_line + 1
                    cur_line_start = cur_pos = next_pos
                    cur_char = BOL
                    input_state = 1
                elif input_state == 4:
                    cur_char = EOF
                    input_state = 5
                else: # input_state = 5
                    cur_char = u''
                # End inlined self.next_char()
            else: # not new_state
                if trace: #TRACE#
                    print("blocked") #TRACE#
                # Begin inlined: action = self.back_up()
                if b_action is not None:
                    (action, cur_pos, cur_line, cur_line_start,
                        cur_char, input_state, next_pos) = \
                        (b_action, b_cur_pos, b_cur_line, b_cur_line_start,
                        b_cur_char, b_input_state, b_next_pos)
                else:
                    action = None
                break # while 1
                # End inlined: action = self.back_up()
        # Publish the local working copies back to the instance.
        self.cur_pos = cur_pos
        self.cur_line = cur_line
        self.cur_line_start = cur_line_start
        self.cur_char = cur_char
        self.input_state = input_state
        self.next_pos = next_pos
        if trace: #TRACE#
            if action is not None: #TRACE#
                print("Doing %s" % action) #TRACE#
        return action

    def read_char(self):
        """
        Return the next character of input and advance next_pos past it,
        refilling the buffer from the stream when it is exhausted.
        Returns u'' (without advancing) at end of input.

        This is the out-of-line form of the code inlined in
        run_machine_inlined(); next_char() relies on it.
        """
        buf_index = self.next_pos - self.buf_start_pos
        if buf_index >= len(self.buffer):
            # Drop everything before the start of the current token, then
            # append the next chunk of input.
            discard = self.start_pos - self.buf_start_pos
            data = self.stream.read(0x1000)
            self.buffer = self.buffer[discard:] + data
            self.buf_start_pos = self.buf_start_pos + discard
            buf_index = buf_index - discard
            if not data:
                return u''
        c = self.buffer[buf_index]
        self.next_pos = self.next_pos + 1
        return c

    def next_char(self):
        """
        Advance cur_char to the next input symbol.

        Besides ordinary characters, the scanner feeds the machine the
        BOL, EOL and EOF markers; input_state tracks which of these is
        due next (see the attribute comments at the top of the class).
        """
        input_state = self.input_state
        if self.trace:
            print("Scanner: next: %s [%d] %d" % (" "*20, input_state, self.cur_pos))
        if input_state == 1:
            self.cur_pos = self.next_pos
            c = self.read_char()
            if c == u'\n':
                self.cur_char = EOL
                self.input_state = 2
            elif not c:
                self.cur_char = EOL
                self.input_state = 4
            else:
                self.cur_char = c
        elif input_state == 2:
            self.cur_char = u'\n'
            self.input_state = 3
        elif input_state == 3:
            self.cur_line = self.cur_line + 1
            self.cur_line_start = self.cur_pos = self.next_pos
            self.cur_char = BOL
            self.input_state = 1
        elif input_state == 4:
            self.cur_char = EOF
            self.input_state = 5
        else: # input_state = 5
            self.cur_char = u''
        if self.trace:
            print("--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)))

    def position(self):
        """
        Return a tuple (name, line, col) representing the location of
        the last token read using the read() method. |name| is the
        name that was provided to the Scanner constructor; |line|
        is the line number in the stream (1-based); |col| is the
        position within the line of the first character of the token
        (0-based).
        """
        return (self.name, self.start_line, self.start_col)

    def get_position(self):
        """Python accessible wrapper around position(), only for error reporting.
        """
        return self.position()

    def begin(self, state_name):
        """Set the current state of the scanner to the named state."""
        self.initial_state = (
            self.lexicon.get_initial_state(state_name))
        self.state_name = state_name

    def produce(self, value, text = None):
        """
        Called from an action procedure, causes |value| to be returned
        as the token value from read(). If |text| is supplied, it is
        returned in place of the scanned text.

        produce() can be called more than once during a single call to an action
        procedure, in which case the tokens are queued up and returned one
        at a time by subsequent calls to read(), until the queue is empty,
        whereupon scanning resumes.
        """
        if text is None:
            text = self.text
        self.queue.append((value, text))

    def eof(self):
        """
        Override this method if you want something to be done at
        end of file.
        """