#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based lexer."""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

from closure_linter.common import tokens

# Shorthand alias so token types can be written as Type.NORMAL,
# Type.BLANK_LINE, etc. throughout this module.
Type = tokens.TokenType


class Tokenizer(object):
  """General purpose, regular expression driven tokenizer.

  The tokenizer walks each line of input, trying the matchers registered for
  the current mode at the current position.  Matched text becomes a token of
  the matcher's type; runs of characters that no matcher accepts are grouped
  into a single token of the mode's default type.  All tokens are chained
  together into a doubly linked list via their `next`/`previous` attributes.

  Attributes:
    mode: The latest mode of the tokenizer.  This allows patterns to distinguish
        if they are mid-comment, mid-parameter list, etc.
    matchers: Dictionary of modes to sequences of matchers that define the
        patterns to check at any given time.
    default_types: Dictionary of modes to types, defining what type to give
        non-matched text when in the given mode.  Defaults to Type.NORMAL.
  """

  def __init__(self, starting_mode, matchers, default_types):
    """Initialize the tokenizer.

    Args:
      starting_mode: Mode to start in.
      matchers: Dictionary of modes to sequences of matchers that defines the
          patterns to check at any given time.  Each matcher is read for its
          `regex`, `type`, `result_mode` and `line_start` attributes.
      default_types: Dictionary of modes to types, defining what type to give
          non-matched text when in the given mode.  Defaults to Type.NORMAL.
    """
    self.__starting_mode = starting_mode
    self.matchers = matchers
    self.default_types = default_types

  def TokenizeFile(self, file):
    """Tokenizes the given file.

    All per-run state (mode, token chain, line counter) is reset on entry, so
    the same Tokenizer instance can be reused for several files.

    Args:
      file: An iterable that yields one line of the file at a time.

    Returns:
      The first token in the file, or None if the iterable yielded no lines.
    """
    # The current mode.
    self.mode = self.__starting_mode
    # The first token in the stream.
    self.__first_token = None
    # The last token added to the token stream.
    self.__last_token = None
    # The current (1-based) line number.
    self.__line_number = 0

    for line in file:
      self.__line_number += 1
      self.__TokenizeLine(line)

    return self.__first_token

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new Token object (or subclass).

    Subclasses may override this hook to produce their own token class.

    Args:
      string: The string of input the token represents.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name' which captures the
        name of the function.

    Returns:
      The newly created Token object.
    """
    return tokens.Token(string, token_type, line, line_number, values)

  def __TokenizeLine(self, line):
    """Tokenizes the given line and appends its tokens to the stream.

    Args:
      line: The contents of the line, including any trailing line terminator.
    """
    string = line.rstrip('\n\r\f')
    line_number = self.__line_number
    # Character offset, within this line, of the next token to be added.
    self.__start_index = 0

    if not string:
      self.__AddToken(self._CreateToken('', Type.BLANK_LINE, line, line_number))
      return

    length = len(string)
    index = 0
    # Start of the pending run of characters that no matcher accepted.  The
    # run is string[normal_start:index] and is empty when the two are equal.
    normal_start = 0
    while index < length:
      for matcher in self.matchers[self.mode]:
        # Line-start matchers are only eligible at the first character.
        if matcher.line_start and index > 0:
          continue

        match = matcher.regex.match(string, index)
        if not match:
          continue

        next_mode = matcher.result_mode or self.mode
        if match.end() == index and next_mode == self.mode:
          # A zero-width match that leaves the mode unchanged makes no
          # progress; accepting it would repeat forever.  Treat it as a
          # non-match and try the remaining matchers instead.
          continue

        # Flush any unmatched text that preceded this match.
        if normal_start < index:
          self.__AddToken(
              self.__CreateNormalToken(self.mode, string[normal_start:index],
                                       line, line_number))

        # Add the match.
        self.__AddToken(self._CreateToken(match.group(), matcher.type, line,
                                          line_number, match.groupdict()))

        # Change the mode to the correct one for after this match.
        self.mode = next_mode

        # Continue scanning right after the matched text.
        index = match.end()
        normal_start = index
        break

      else:
        # No matcher accepted this position: the character joins the pending
        # run of unmatched text, which becomes a default-type token later.
        index += 1

    # Flush unmatched text left over at the end of the line.
    if normal_start < index:
      self.__AddToken(
          self.__CreateNormalToken(self.mode, string[normal_start:index], line,
                                   line_number))

  def __CreateNormalToken(self, mode, string, line, line_number):
    """Creates a token for text that no matcher recognized.

    Args:
      mode: The current mode.
      string: The unmatched text the token represents.
      line: The line of text.
      line_number: The line number within the file.

    Returns:
      A Token object, of the default type for the given mode (Type.NORMAL if
      the mode has no entry in default_types).
    """
    if mode in self.default_types:
      token_type = self.default_types[mode]
    else:
      token_type = Type.NORMAL
    return self._CreateToken(string, token_type, line, line_number)

  def __AddToken(self, token):
    """Add the given token to the token stream.

    Links the token into the doubly linked list and records its starting
    character index within the current line.

    Args:
      token: The token to add.
    """
    # Store the first token, or point the previous token to this one.  The
    # check is an identity test so that it does not depend on the truthiness
    # of token objects.
    if self.__first_token is None:
      self.__first_token = token
    else:
      self.__last_token.next = token

    # Establish the doubly linked list
    token.previous = self.__last_token
    self.__last_token = token

    # Compute the character indices
    token.start_index = self.__start_index
    self.__start_index += token.length
185