#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Regular expression based lexer."""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

from closure_linter.common import tokens

# Shorthand
Type = tokens.TokenType


class Tokenizer(object):
  """General purpose tokenizer.

  The tokenizer walks each line left to right.  At every position it tries the
  matchers registered for the current mode, in order; the first one that
  matches produces a token and may switch the mode.  Text that no matcher
  claims is collected into a token of the mode's default type.  Tokens are
  chained together as a doubly linked list via `previous` / `next`.

  Attributes:
    mode: The latest mode of the tokenizer.  This allows patterns to distinguish
        if they are mid-comment, mid-parameter list, etc.
    matchers: Dictionary of modes to sequences of matchers that define the
        patterns to check at any given time.
    default_types: Dictionary of modes to types, defining what type to give
        non-matched text when in the given mode.  Defaults to Type.NORMAL.
  """

  def __init__(self, starting_mode, matchers, default_types):
    """Initialize the tokenizer.

    Args:
      starting_mode: Mode to start in.
      matchers: Dictionary of modes to sequences of matchers that defines the
          patterns to check at any given time.
      default_types: Dictionary of modes to types, defining what type to give
          non-matched text when in the given mode.  Defaults to Type.NORMAL.
    """
    self.__starting_mode = starting_mode
    self.matchers = matchers
    self.default_types = default_types

    # Per-run state.  TokenizeFile resets all of it; it is initialised here so
    # the attributes exist even before the first call.
    self.mode = starting_mode
    self.__first_token = None
    self.__last_token = None
    self.__line_number = 0
    self.__start_index = 0

  def TokenizeFile(self, file):
    """Tokenizes the given file.

    Args:
      file: An iterable that yields one line of the file at a time.

    Returns:
      The first token in the file, or None if the iterable yielded no lines.
    """
    # The current mode.
    self.mode = self.__starting_mode
    # The first token in the stream.
    self.__first_token = None
    # The last token added to the token stream.
    self.__last_token = None
    # The current line number (1-based once the first line is read).
    self.__line_number = 0

    for line in file:
      self.__line_number += 1
      self.__TokenizeLine(line)

    return self.__first_token

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new Token object (or subclass).

    Subclasses may override this to produce their own token class.

    Args:
      string: The string of input the token represents.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
          function declaration may have a value called 'name' which captures
          the name of the function.

    Returns:
      The newly created Token object.
    """
    return tokens.Token(string, token_type, line, line_number, values)

  def __FindMatch(self, string, index):
    """Finds the first matcher of the current mode that applies at index.

    Args:
      string: The line contents, without the trailing line terminator.
      index: The position in string at which the match must begin.

    Returns:
      A (matcher, match) pair for the first applicable matcher, or
      (None, None) if no matcher applies at this position.
    """
    for matcher in self.matchers[self.mode]:
      # Matchers flagged line_start are only tried at the start of the line.
      if matcher.line_start and index > 0:
        continue

      match = matcher.regex.match(string, index)
      if not match:
        continue

      # A zero-width match that also leaves the mode unchanged makes no
      # progress: the next iteration would see the same position and mode and
      # match again, forever.  Skip it so a later matcher (or the default
      # token) can consume the text.  Zero-width matches that switch mode are
      # kept, since the new mode's matchers get a chance to advance.
      if (match.end() == index and
          (matcher.result_mode or self.mode) == self.mode):
        continue

      return matcher, match

    return None, None

  def __TokenizeLine(self, line):
    """Tokenizes the given line.

    Args:
      line: The contents of the line.
    """
    string = line.rstrip('\n\r\f')
    line_number = self.__line_number
    # Token start indices are relative to the beginning of the line.
    self.__start_index = 0

    if not string:
      self.__AddToken(self._CreateToken('', Type.BLANK_LINE, line, line_number))
      return

    index = 0
    # Start of the pending run of characters that no matcher has claimed.
    normal_start = 0
    end = len(string)
    while index < end:
      matcher, match = self.__FindMatch(string, index)
      if match is None:
        # No matcher applies here; the character joins the pending run.
        index += 1
        continue

      # Flush unmatched text seen before this match, using the mode that was
      # in effect while that text was read.
      if normal_start < index:
        self.__AddToken(self.__CreateNormalToken(
            self.mode, string[normal_start:index], line, line_number))

      # Add the match.
      self.__AddToken(self._CreateToken(match.group(), matcher.type, line,
                                        line_number, match.groupdict()))

      # Change the mode to the correct one for after this match.
      self.mode = matcher.result_mode or self.mode

      # Continue scanning right after the matched text.
      index = match.end()
      normal_start = index

    # Flush any unmatched text left at the end of the line.
    if normal_start < end:
      self.__AddToken(self.__CreateNormalToken(
          self.mode, string[normal_start:], line, line_number))

  def __CreateNormalToken(self, mode, string, line, line_number):
    """Creates a normal token.

    Args:
      mode: The current mode.
      string: The string to tokenize.
      line: The line of text.
      line_number: The line number within the file.

    Returns:
      A Token object, of the default type for the current mode
      (Type.NORMAL when the mode has no entry in default_types).
    """
    token_type = self.default_types.get(mode, Type.NORMAL)
    return self._CreateToken(string, token_type, line, line_number)

  def __AddToken(self, token):
    """Add the given token to the token stream.

    Args:
      token: The token to add.
    """
    # Store the first token, or point the previous token to this one.
    if self.__first_token is None:
      self.__first_token = token
    else:
      self.__last_token.next = token

    # Establish the doubly linked list
    token.previous = self.__last_token
    self.__last_token = token

    # Compute the character indices
    token.start_index = self.__start_index
    self.__start_index += token.length