#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based lexer."""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

from closure_linter.common import tokens

# Shorthand
Type = tokens.TokenType


class Tokenizer(object):
  """General purpose tokenizer.

  Attributes:
    mode: The current mode of the tokenizer. This allows patterns to
        distinguish if they are mid-comment, mid-parameter list, etc.
    matchers: Dictionary of modes to sequences of matchers that define the
        patterns to check at any given time.
    default_types: Dictionary of modes to types, defining what type to give
        non-matched text when in the given mode. Defaults to Type.NORMAL.
  """

  def __init__(self, starting_mode, matchers, default_types):
    """Initialize the tokenizer.

    Args:
      starting_mode: Mode to start in.
      matchers: Dictionary of modes to sequences of matchers that define the
          patterns to check at any given time.
      default_types: Dictionary of modes to types, defining what type to give
          non-matched text when in the given mode. Defaults to Type.NORMAL.
    """
    self.__starting_mode = starting_mode
    self.matchers = matchers
    self.default_types = default_types

  def TokenizeFile(self, file):
    """Tokenizes the given file.

    Args:
      file: An iterable that yields one line of the file at a time.

    Returns:
      The first token in the file, or None if the file yields no lines.
    """
    # The current mode.
    self.mode = self.__starting_mode
    # The first token in the stream.
    self.__first_token = None
    # The last token added to the token stream.
    self.__last_token = None
    # The current line number.
    self.__line_number = 0

    for line in file:
      self.__line_number += 1
      self.__TokenizeLine(line)

    return self.__first_token

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new Token object (or subclass).

    Args:
      string: The string of input the token represents.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token. For instance, a
          function declaration may have a value called 'name' which captures
          the name of the function.

    Returns:
      The newly created Token object.
    """
    return tokens.Token(string, token_type, line, line_number, values)

  def __TokenizeLine(self, line):
    """Tokenizes the given line.

    Args:
      line: The contents of the line.
    """
    string = line.rstrip('\n\r\f')
    line_number = self.__line_number
    self.__start_index = 0

    if not string:
      self.__AddToken(self._CreateToken('', Type.BLANK_LINE, line, line_number))
      return

    normal_token = ''
    index = 0
    while index < len(string):
      for matcher in self.matchers[self.mode]:
        if matcher.line_start and index > 0:
          continue

        match = matcher.regex.match(string, index)

        if match:
          if normal_token:
            self.__AddToken(
                self.__CreateNormalToken(self.mode, normal_token, line,
                                         line_number))
            normal_token = ''

          # Add the match.
          self.__AddToken(self._CreateToken(match.group(), matcher.type, line,
                                            line_number, match.groupdict()))

          # Change the mode to the correct one for after this match.
          self.mode = matcher.result_mode or self.mode

          # Advance the index past this match.
          index = match.end()

          break

      else:
        # If the for loop finishes without a match, append the next character
        # to the run of consecutive non-matching characters. These runs become
        # tokens of the mode's default type.
        normal_token += string[index]
        index += 1

    if normal_token:
      self.__AddToken(
          self.__CreateNormalToken(self.mode, normal_token, line, line_number))

  def __CreateNormalToken(self, mode, string, line, line_number):
    """Creates a normal token.

    Args:
      mode: The current mode.
      string: The string to tokenize.
      line: The line of text.
      line_number: The line number within the file.

    Returns:
      A Token object, of the default type for the current mode.
    """
    token_type = Type.NORMAL
    if mode in self.default_types:
      token_type = self.default_types[mode]
    return self._CreateToken(string, token_type, line, line_number)

  def __AddToken(self, token):
    """Adds the given token to the token stream.

    Args:
      token: The token to add.
    """
    # Store the first token, or point the previous token to this one.
    if not self.__first_token:
      self.__first_token = token
    else:
      self.__last_token.next = token

    # Establish the doubly linked list.
    token.previous = self.__last_token
    self.__last_token = token

    # Compute the character indices.
    token.start_index = self.__start_index
    self.__start_index += token.length