#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based JavaScript parsing classes."""

# Authorship metadata for this module.
__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
22c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)import copy
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)import re
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)from closure_linter import javascripttokens
262a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)from closure_linter.common import matcher
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)from closure_linter.common import tokenizer
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
# Shorthand aliases so the matcher tables below stay readable:
#   Type    - the JavaScript token type enumeration.
#   Matcher - the matcher class from closure_linter.common.matcher.
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
class JavaScriptModes(object):
  """Names of the lexer modes the JavaScript tokenizer can be in.

  Each attribute is a plain string used as a key into the per-mode matcher
  table.
  """

  # Ordinary code, outside of any string or comment.
  TEXT_MODE = 'text'

  # Inside a string literal, by quote style.
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'

  # Inside a comment, by comment style.
  LINE_COMMENT_MODE = 'line_comment'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'

  # After the 'function' keyword and inside its parameter list.
  FUNCTION_MODE = 'function'
  PARAMETER_MODE = 'parameter'
45c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Convert JavaScript code in to an array of tokens.

  The class attributes below are the regular expressions (and regular
  expression fragments) used for lexing.  BuildMatchers() arranges them into
  an ordered list of Matcher objects for each JavaScriptModes mode, and
  __init__() hands that mapping to tokenizer.Tokenizer.
  """

  # Useful patterns for JavaScript parsing.
  # Body of a character class listing the characters allowed inside an
  # identifier.  '.' is included because dotted names are lexed as a single
  # identifier (see NESTED_IDENTIFIER).
  IDENTIFIER_CHAR = r'A-Za-z0-9_$.'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  MANTISSA = r"""
             (\d+(?!\.)) |                # Matches '10'
             (\d+\.(?!\d)) |              # Matches '10.'
             (\d*\.\d+)                   # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  # Hex is tried first so that '0x1F' is not lexed as the decimal '0'.
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  # A '//' with nothing after it on the line.
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  # Any run of characters that does not contain the closing '*/'.
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{}\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  # Same as DOC_COMMENT_TEXT, except that whitespace also ends the match.
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the following tokens to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof, void - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do', 'else',
      'finally', 'for', 'if', 'return', 'switch', 'throw', 'try', 'var',
      'while', 'with']

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  # Every entry containing a backslash is a raw string, so that the backslash
  # reaches the regex engine unchanged and Python does not warn about an
  # invalid string escape.
  OPERATOR_LIST = [',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=',
                   '!=', '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+',
                   '--', r'\^=', '-=', '-', '/=', '/', r'\*=', r'\*', '%=', '%',
                   '&&', r'\|\|', '&=', '&', r'\|=', r'\|', '=', '!', ':',
                   r'\?', r'\^', r'\bdelete\b', r'\bin\b', r'\binstanceof\b',
                   r'\bnew\b', r'\btypeof\b', r'\bvoid\b']
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier.
  NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  # An identifier that is directly assigned to: it must be followed by a single
  # '=' (optionally after whitespace) that is not the start of '=='.
  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not follwed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by letters that appears at the beginning of
  # the line or after whitespace.  The look-behind check is necessary to not
  # match someone@google.com as a flag.  Flags that directly follow a '{' are
  # matched by DOC_INLINE_FLAG instead.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names, we need to tokenize whitespace into a
  # token.  Only the flags listed here switch the lexer into that mode.
  DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
                                     '|'.join(['param']))

  # A flag written immediately after an opening brace, e.g. '{@code'.
  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # NOTE(review): the original comment says this is used for TYPE_GROUP, but no
  # such pattern is defined in this class; kept in case other code refers to it.
  SAFE_STAR = r'(\*(?!/))'

  # Matchers shared by both doc comment modes.  Order matters: earlier entries
  # take precedence over later ones.
  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),

      # The trailing (None, True) are passed positionally to Matcher -
      # presumably "no mode change" and "only at the start of a line"; confirm
      # against matcher.Matcher's signature.
      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  @classmethod
  def BuildMatchers(cls):
    """Builds the token matcher group.

    The token matcher groups work as follows: it is a list of Matcher objects.
    The matchers will be tried in this order, and the first to match will be
    returned.  Hence the order is important because the matchers that come first
    overrule the matchers that come later.

    A new dict is created on every call.  The two doc comment entries are new
    lists as well, but they contain the Matcher objects from
    cls.COMMON_DOC_MATCHERS rather than copies of them.

    Returns:
      The completed token matcher group: a dict mapping each JavaScriptModes
      value to its ordered list of Matcher objects.
    """
    # Match a keyword string followed by a non-identifier character in order to
    # not match something like doSomething as do + Something.
    keyword = re.compile('(%s)((?=[^%s])|$)' % (
        '|'.join(cls.KEYWORD_LIST), cls.IDENTIFIER_CHAR))
    return {

        # Matchers for basic text mode.
        JavaScriptModes.TEXT_MODE: [
            # Check a big group - strings, starting comments, and regexes - all
            # of which could be intertwined.  'string with /regex/',
            # /regex with 'string'/, /* comment with /regex/ and string */ (and
            # so on)
            # The doc comment opener '/**' is listed before the block comment
            # opener '/*', which is a prefix of it.
            Matcher(cls.START_DOC_COMMENT, Type.START_DOC_COMMENT,
                    JavaScriptModes.DOC_COMMENT_MODE),
            Matcher(cls.START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                    JavaScriptModes.BLOCK_COMMENT_MODE),
            # A '//' that ends the line has no comment text after it, so this
            # matcher does not request a switch to line comment mode.
            Matcher(cls.END_OF_LINE_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT),
            Matcher(cls.START_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT,
                    JavaScriptModes.LINE_COMMENT_MODE),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                    JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                    JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
            Matcher(cls.REGEX, Type.REGEX),

            # Next we check for start blocks appearing outside any of the items
            # above.
            Matcher(cls.START_BLOCK, Type.START_BLOCK),
            Matcher(cls.END_BLOCK, Type.END_BLOCK),

            # Then we search for function declarations.
            Matcher(cls.FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                    JavaScriptModes.FUNCTION_MODE),

            # Next, we convert non-function related parens to tokens.
            Matcher(cls.OPENING_PAREN, Type.START_PAREN),
            Matcher(cls.CLOSING_PAREN, Type.END_PAREN),

            # Next, we convert brackets to tokens.
            Matcher(cls.OPENING_BRACKET, Type.START_BRACKET),
            Matcher(cls.CLOSING_BRACKET, Type.END_BRACKET),

            # Find numbers.  This has to happen before operators because
            # scientific notation numbers can have + and - in them.
            Matcher(cls.NUMBER, Type.NUMBER),

            # Find operators and simple assignments
            Matcher(cls.SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
            Matcher(cls.OPERATOR, Type.OPERATOR),

            # Find key words and whitespace.
            Matcher(keyword, Type.KEYWORD),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),

            # Find identifiers.
            Matcher(cls.IDENTIFIER, Type.IDENTIFIER),

            # Finally, we convert semicolons to tokens.
            Matcher(cls.SEMICOLON, Type.SEMICOLON)],

        # Matchers for single quote strings.
        JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
            Matcher(cls.SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for double quote strings.
        JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
            Matcher(cls.DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for block comments.
        JavaScriptModes.BLOCK_COMMENT_MODE: [
            # First we check for exiting a block comment.
            Matcher(cls.END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                    JavaScriptModes.TEXT_MODE),

            # Match non-comment-ending text..
            Matcher(cls.BLOCK_COMMENT_TEXT, Type.COMMENT)],

        # Matchers for doc comments.
        JavaScriptModes.DOC_COMMENT_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.DOC_COMMENT_TEXT, Type.COMMENT)],

        # Matchers for doc comments where whitespace runs and non-whitespace
        # runs are emitted as separate comment tokens.
        JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.WHITESPACE, Type.COMMENT),
            Matcher(cls.DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

        # Matchers for single line comments.
        JavaScriptModes.LINE_COMMENT_MODE: [
            # We greedy match until the end of the line in line comment mode.
            Matcher(cls.ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

        # Matchers for code after the function keyword.
        JavaScriptModes.FUNCTION_MODE: [
            # Must match open paren before anything else and move into parameter
            # mode, otherwise everything inside the parameter list is parsed
            # incorrectly.
            Matcher(cls.OPENING_PAREN, Type.START_PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),
            Matcher(cls.IDENTIFIER, Type.FUNCTION_NAME)],

        # Matchers for function parameters
        JavaScriptModes.PARAMETER_MODE: [
            # When in function parameter mode, a closing paren is treated
            # specially. Everything else is treated as lines of parameters.
            Matcher(cls.CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                    JavaScriptModes.TEXT_MODE),
            Matcher(cls.PARAMETERS, Type.PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE)]}

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    matchers = self.BuildMatchers()
    if not parse_js_doc:
      # Rebind the doc comment entry on a shallow copy, so that the mapping
      # returned by BuildMatchers() is left untouched.  A shallow copy is
      # enough because only a key is rebound and no matcher list is mutated.
      # It also avoids deep-copying compiled regular expressions, which
      # interpreters before Python 3.7 refuse to do.
      matchers = copy.copy(matchers)
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name' which captures the
        name of the function.

    Returns:
      The newly created javascripttokens.JavaScriptToken.
    """
    # line_number is deliberately passed twice; the final positional argument
    # is presumably the token's original line number - confirm against
    # JavaScriptToken's constructor.
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values, line_number)
374