#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based JavaScript parsing classes."""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

import copy
import re

from closure_linter import javascripttokens
from closure_linter.common import matcher
from closure_linter.common import tokenizer

# Shorthand
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher


class JavaScriptModes(object):
  """Enumeration of the different matcher modes used for JavaScript.

  Each value is a key of the dict returned by
  JavaScriptTokenizer.BuildMatchers() and selects the list of matchers that
  is active while the tokenizer is in that mode.
  """
  TEXT_MODE = 'text'
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
  LINE_COMMENT_MODE = 'line_comment'
  PARAMETER_MODE = 'parameter'
  FUNCTION_MODE = 'function'


class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Convert JavaScript code in to an array of tokens.
  """

  # Useful patterns for JavaScript parsing.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  # NOTE: MANTISSA carries verbose-mode comments, so every pattern built from
  # it must be compiled with re.VERBOSE.
  MANTISSA = r"""
             (\d+(?!\.))         |                # Matches '10'
             (\d+\.(?!\d))       |                # Matches '10.'
             (\d*\.\d+)                           # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
  # Match anything that is allowed in a type definition, except for tokens
  # needed to parse it (and the lookahead assertion for "*/").
  DOC_COMMENT_TYPE_TEXT = re.compile(r'([^*|!?=<>(){}:,\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the tokens listed below to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = [
      'break',
      'case',
      'catch',
      'continue',
      'default',
      'do',
      'else',
      'finally',
      'for',
      'if',
      'return',
      'switch',
      'throw',
      'try',
      'var',
      'while',
      'with',
  ]

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  OPERATOR_LIST = [
      ',',
      r'\+\+',
      '===',
      '!==',
      '>>>=',
      '>>>',
      '==',
      '>=',
      '<=',
      '!=',
      '<<=',
      '>>=',
      '<<',
      '>>',
      '=>',
      '>',
      '<',
      r'\+=',
      r'\+',
      '--',
      r'\^=',
      '-=',
      '-',
      '/=',
      '/',
      r'\*=',
      r'\*',
      '%=',
      '%',
      '&&',
      r'\|\|',
      '&=',
      '&',
      r'\|=',
      r'\|',
      '=',
      '!',
      ':',
      r'\?',
      r'\^',
      r'\bdelete\b',
      r'\bin\b',
      r'\binstanceof\b',
      r'\bnew\b',
      r'\btypeof\b',
      r'\bvoid\b',
      r'\.',
  ]
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier, except for trailing dots.
  NESTED_IDENTIFIER = r'[a-zA-Z_$]([%s]|\.[a-zA-Z_$])*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not follwed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by non-space characters that appears at the
  # beginning of the line, after whitespace, or after a '{'.  The look-behind
  # check is necessary to not match someone@google.com as a flag.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names and complex doctypes containing
  # whitespace, we need to tokenize whitespace into a token after certain
  # doctags. All statetracker.HAS_TYPE that are not listed here must not contain
  # any whitespace in their types.
  DOC_FLAG_LEX_SPACES = re.compile(
      r'(^|(?<=\s))@(?P<name>%s)\b' %
      '|'.join([
          'const',
          'enum',
          'export',
          'extends',
          'final',
          'implements',
          'package',
          'param',
          'private',
          'protected',
          'public',
          'return',
          'type',
          'typedef'
      ]))

  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  DOC_TYPE_BLOCK_START = re.compile(r'[<(]')
  DOC_TYPE_BLOCK_END = re.compile(r'[>)]')
  DOC_TYPE_MODIFIERS = re.compile(r'[!?|,:=]')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # This is used for TYPE_GROUP below.
  SAFE_STAR = r'(\*(?!/))'

  # Matchers shared by both doc comment modes.  BuildMatchers() prepends this
  # list to the mode-specific matchers, so these take precedence over them.
  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),

      # And some more to parse types.
      Matcher(DOC_TYPE_BLOCK_START, Type.DOC_TYPE_START_BLOCK),
      Matcher(DOC_TYPE_BLOCK_END, Type.DOC_TYPE_END_BLOCK),

      Matcher(DOC_TYPE_MODIFIERS, Type.DOC_TYPE_MODIFIER),
      Matcher(DOC_COMMENT_TYPE_TEXT, Type.COMMENT),

      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  @classmethod
  def BuildMatchers(cls):
    """Builds the token matcher group.

    The token matcher groups work as follows: it is a list of Matcher objects.
    The matchers will be tried in this order, and the first to match will be
    returned.  Hence the order is important because the matchers that come first
    overrule the matchers that come later.

    Returns:
      The completed token matcher group: a newly built dict mapping each
      JavaScriptModes value to the ordered list of Matcher objects for that
      mode.  The Matcher objects taken from COMMON_DOC_MATCHERS are the
      class-level instances, not copies.
    """
    # Match a keyword string followed by a non-identifier character in order to
    # not match something like doSomething as do + Something.
    keyword = re.compile('(%s)((?=[^%s])|$)' % (
        '|'.join(cls.KEYWORD_LIST), cls.IDENTIFIER_CHAR))
    return {

        # Matchers for basic text mode.
        JavaScriptModes.TEXT_MODE: [
            # Check a big group - strings, starting comments, and regexes - all
            # of which could be intertwined.  'string with /regex/',
            # /regex with 'string'/, /* comment with /regex/ and string */ (and
            # so on)
            Matcher(cls.START_DOC_COMMENT, Type.START_DOC_COMMENT,
                    JavaScriptModes.DOC_COMMENT_MODE),
            Matcher(cls.START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                    JavaScriptModes.BLOCK_COMMENT_MODE),
            Matcher(cls.END_OF_LINE_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT),
            Matcher(cls.START_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT,
                    JavaScriptModes.LINE_COMMENT_MODE),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                    JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                    JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
            Matcher(cls.REGEX, Type.REGEX),

            # Next we check for start blocks appearing outside any of the items
            # above.
            Matcher(cls.START_BLOCK, Type.START_BLOCK),
            Matcher(cls.END_BLOCK, Type.END_BLOCK),

            # Then we search for function declarations.
            Matcher(cls.FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                    JavaScriptModes.FUNCTION_MODE),

            # Next, we convert non-function related parens to tokens.
            Matcher(cls.OPENING_PAREN, Type.START_PAREN),
            Matcher(cls.CLOSING_PAREN, Type.END_PAREN),

            # Next, we convert brackets to tokens.
            Matcher(cls.OPENING_BRACKET, Type.START_BRACKET),
            Matcher(cls.CLOSING_BRACKET, Type.END_BRACKET),

            # Find numbers.  This has to happen before operators because
            # scientific notation numbers can have + and - in them.
            Matcher(cls.NUMBER, Type.NUMBER),

            # Find operators and simple assignments
            Matcher(cls.SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
            Matcher(cls.OPERATOR, Type.OPERATOR),

            # Find key words and whitespace.
            Matcher(keyword, Type.KEYWORD),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),

            # Find identifiers.
            Matcher(cls.IDENTIFIER, Type.IDENTIFIER),

            # Finally, we convert semicolons to tokens.
            Matcher(cls.SEMICOLON, Type.SEMICOLON)],

        # Matchers for single quote strings.
        JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
            Matcher(cls.SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for double quote strings.
        JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
            Matcher(cls.DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for block comments.
        JavaScriptModes.BLOCK_COMMENT_MODE: [
            # First we check for exiting a block comment.
            Matcher(cls.END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                    JavaScriptModes.TEXT_MODE),

            # Match non-comment-ending text..
            Matcher(cls.BLOCK_COMMENT_TEXT, Type.COMMENT)],

        # Matchers for doc comments.
        JavaScriptModes.DOC_COMMENT_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.DOC_COMMENT_TEXT, Type.COMMENT)],

        JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.WHITESPACE, Type.COMMENT),
            Matcher(cls.DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

        # Matchers for single line comments.
        JavaScriptModes.LINE_COMMENT_MODE: [
            # We greedy match until the end of the line in line comment mode.
            Matcher(cls.ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

        # Matchers for code after the function keyword.
        JavaScriptModes.FUNCTION_MODE: [
            # Must match open paren before anything else and move into parameter
            # mode, otherwise everything inside the parameter list is parsed
            # incorrectly.
            Matcher(cls.OPENING_PAREN, Type.START_PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),
            Matcher(cls.IDENTIFIER, Type.FUNCTION_NAME)],

        # Matchers for function parameters
        JavaScriptModes.PARAMETER_MODE: [
            # When in function parameter mode, a closing paren is treated
            # specially. Everything else is treated as lines of parameters.
            Matcher(cls.CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                    JavaScriptModes.TEXT_MODE),
            Matcher(cls.PARAMETERS, Type.PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE)]}

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    matchers = self.BuildMatchers()
    if not parse_js_doc:
      # Only one top-level key is rebound, so a shallow copy is sufficient to
      # keep the dict returned by BuildMatchers() untouched.  A deep copy must
      # be avoided: the matchers hold compiled regular expressions, which
      # cannot be deep-copied on Python versions before 3.7 (TypeError).
      matchers = copy.copy(matchers)
      # Doc comments are tokenized with the plain block comment matchers.
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name' which captures the
        name of the function.

    Returns:
      A new javascripttokens.JavaScriptToken built from the arguments.
    """
    # line_number is deliberately passed twice; the final positional argument
    # is presumably the token's original line number - confirm against
    # JavaScriptToken's constructor.
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values, line_number)