#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based JavaScript parsing classes."""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

import copy
import re

from closure_linter import javascripttokens
from closure_linter.common import matcher
from closure_linter.common import tokenizer

# Shorthand
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher


class JavaScriptModes(object):
  """Enumeration of the different matcher modes used for JavaScript."""
  TEXT_MODE = 'text'
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
  LINE_COMMENT_MODE = 'line_comment'
  PARAMETER_MODE = 'parameter'
  FUNCTION_MODE = 'function'


class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Convert JavaScript code in to an array of tokens.
  """

  # Useful patterns for JavaScript parsing.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$.'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  # NOTE: MANTISSA embeds whitespace and '#' comments, so any pattern built
  # from it must be compiled with re.VERBOSE (as NUMBER is below).
  MANTISSA = r"""
             (\d+(?!\.)) |                # Matches '10'
             (\d+\.(?!\d)) |              # Matches '10.'
             (\d*\.\d+)                   # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  # Like MANTISSA, this fragment contains layout whitespace and comments and
  # is only valid inside a pattern compiled with re.VERBOSE.
  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the tokens below to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do', 'else',
                  'finally', 'for', 'if', 'return', 'switch', 'throw', 'try',
                  'var', 'while', 'with']

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  # Every escaped entry is a raw string so that Python never interprets the
  # backslash itself (non-raw '\^' / '\?' are invalid escape sequences).
  OPERATOR_LIST = [',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=',
                   '!=', '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+',
                   '--', r'\^=', '-=', '-', '/=', '/', r'\*=', r'\*', '%=', '%',
                   '&&', r'\|\|', '&=', '&', r'\|=', r'\|', '=', '!', ':',
                   r'\?', r'\^', r'\bdelete\b', r'\bin\b', r'\binstanceof\b',
                   r'\bnew\b', r'\btypeof\b', r'\bvoid\b']
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier.
  NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not followed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by letters that appears at the beginning
  # of the line or after whitespace.  The look-behind check is necessary to
  # not match someone@google.com as a flag.  Flags that directly follow a '{'
  # are matched separately by DOC_INLINE_FLAG.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names, we need to tokenize whitespace into a
  # token.
  DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
                                   '|'.join(['param']))

  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # This is used for TYPE_GROUP below.
  SAFE_STAR = r'(\*(?!/))'

  # Matchers shared by both doc comment modes; BuildMatchers appends the
  # mode-specific comment text matchers to this list.
  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),
      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  @classmethod
  def BuildMatchers(cls):
    """Builds the token matcher group.

    The token matcher groups work as follows: it is a list of Matcher objects.
    The matchers will be tried in this order, and the first to match will be
    returned.  Hence the order is important because the matchers that come first
    overrule the matchers that come later.

    A new dict (with new per-mode lists) is created on every call; the Matcher
    objects in COMMON_DOC_MATCHERS are shared between calls.

    Returns:
      The completed token matcher group: a dict mapping each JavaScriptModes
      value to the ordered list of Matcher objects tried in that mode.
    """
    # Match a keyword string followed by a non-identifier character in order to
    # not match something like doSomething as do + Something.
    keyword = re.compile('(%s)((?=[^%s])|$)' % (
        '|'.join(cls.KEYWORD_LIST), cls.IDENTIFIER_CHAR))
    return {

        # Matchers for basic text mode.
        JavaScriptModes.TEXT_MODE: [
            # Check a big group - strings, starting comments, and regexes - all
            # of which could be intertwined.  'string with /regex/',
            # /regex with 'string'/, /* comment with /regex/ and string */ (and
            # so on)
            Matcher(cls.START_DOC_COMMENT, Type.START_DOC_COMMENT,
                    JavaScriptModes.DOC_COMMENT_MODE),
            Matcher(cls.START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                    JavaScriptModes.BLOCK_COMMENT_MODE),
            Matcher(cls.END_OF_LINE_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT),
            Matcher(cls.START_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT,
                    JavaScriptModes.LINE_COMMENT_MODE),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                    JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                    JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
            Matcher(cls.REGEX, Type.REGEX),

            # Next we check for start blocks appearing outside any of the items
            # above.
            Matcher(cls.START_BLOCK, Type.START_BLOCK),
            Matcher(cls.END_BLOCK, Type.END_BLOCK),

            # Then we search for function declarations.
            Matcher(cls.FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                    JavaScriptModes.FUNCTION_MODE),

            # Next, we convert non-function related parens to tokens.
            Matcher(cls.OPENING_PAREN, Type.START_PAREN),
            Matcher(cls.CLOSING_PAREN, Type.END_PAREN),

            # Next, we convert brackets to tokens.
            Matcher(cls.OPENING_BRACKET, Type.START_BRACKET),
            Matcher(cls.CLOSING_BRACKET, Type.END_BRACKET),

            # Find numbers.  This has to happen before operators because
            # scientific notation numbers can have + and - in them.
            Matcher(cls.NUMBER, Type.NUMBER),

            # Find operators and simple assignments
            Matcher(cls.SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
            Matcher(cls.OPERATOR, Type.OPERATOR),

            # Find key words and whitespace.
            Matcher(keyword, Type.KEYWORD),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),

            # Find identifiers.
            Matcher(cls.IDENTIFIER, Type.IDENTIFIER),

            # Finally, we convert semicolons to tokens.
            Matcher(cls.SEMICOLON, Type.SEMICOLON)],

        # Matchers for single quote strings.
        JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
            Matcher(cls.SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for double quote strings.
        JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
            Matcher(cls.DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for block comments.
        JavaScriptModes.BLOCK_COMMENT_MODE: [
            # First we check for exiting a block comment.
            Matcher(cls.END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                    JavaScriptModes.TEXT_MODE),

            # Match non-comment-ending text..
            Matcher(cls.BLOCK_COMMENT_TEXT, Type.COMMENT)],

        # Matchers for doc comments.
        JavaScriptModes.DOC_COMMENT_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.DOC_COMMENT_TEXT, Type.COMMENT)],

        JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.WHITESPACE, Type.COMMENT),
            Matcher(cls.DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

        # Matchers for single line comments.
        JavaScriptModes.LINE_COMMENT_MODE: [
            # We greedy match until the end of the line in line comment mode.
            Matcher(cls.ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

        # Matchers for code after the function keyword.
        JavaScriptModes.FUNCTION_MODE: [
            # Must match open paren before anything else and move into parameter
            # mode, otherwise everything inside the parameter list is parsed
            # incorrectly.
            Matcher(cls.OPENING_PAREN, Type.START_PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),
            Matcher(cls.IDENTIFIER, Type.FUNCTION_NAME)],

        # Matchers for function parameters
        JavaScriptModes.PARAMETER_MODE: [
            # When in function parameter mode, a closing paren is treated
            # specially.  Everything else is treated as lines of parameters.
            Matcher(cls.CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                    JavaScriptModes.TEXT_MODE),
            Matcher(cls.PARAMETERS, Type.PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE)]}

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    matchers = self.BuildMatchers()
    if not parse_js_doc:
      # Only the top-level mapping is modified, so a shallow copy is enough to
      # keep the dict returned by BuildMatchers untouched.  A deep copy would
      # also copy every Matcher, and compiled regular expressions cannot be
      # deep-copied on interpreters older than Python 3.7 (TypeError).
      matchers = copy.copy(matchers)
      # Doc comments are tokenized exactly like plain block comments.
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name' which captures the
        name of the function.

    Returns:
      The newly created javascripttokens.JavaScriptToken.
    """
    # NOTE(review): line_number is deliberately passed twice; the sixth
    # positional argument is presumably the token's original line number -
    # confirm against JavaScriptToken's constructor.
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values, line_number)