1#!/usr/bin/env python
2#
3# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS-IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17"""Regular expression based JavaScript parsing classes."""
18
19__author__ = ('robbyw@google.com (Robert Walker)',
20              'ajp@google.com (Andy Perelson)')
21
22import copy
23import re
24
25from closure_linter import javascripttokens
26from closure_linter.common import matcher
27from closure_linter.common import tokenizer
28
29# Shorthand
30Type = javascripttokens.JavaScriptTokenType
31Matcher = matcher.Matcher
32
33
class JavaScriptModes(object):
  """Names of the matcher modes the JavaScript tokenizer can be in.

  Each attribute is a plain string constant that identifies one mode.  The
  attributes are grouped below by the kind of input the mode covers.
  """

  # Ordinary code, plus the modes entered after the 'function' keyword.
  TEXT_MODE = 'text'
  FUNCTION_MODE = 'function'
  PARAMETER_MODE = 'parameter'

  # Inside a string literal.
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'

  # Inside a comment.
  LINE_COMMENT_MODE = 'line_comment'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
45
46
class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Converts JavaScript code into an array of tokens.

  The class-level attributes below are the regular expressions (and the lists
  used to build them) that BuildMatchers() wires into per-mode Matcher lists.
  """

  # Useful patterns for JavaScript parsing.
  # Characters allowed inside an identifier, written so that the string can be
  # dropped into a regex character class (see NESTED_IDENTIFIER and the
  # keyword regex in BuildMatchers).
  IDENTIFIER_CHAR = r'A-Za-z0-9_$'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  # MANTISSA contains whitespace and comments, so any regex built from it must
  # be compiled with re.VERBOSE (as NUMBER is).
  MANTISSA = r"""
             (\d+(?!\.)) |                # Matches '10'
             (\d+\.(?!\d)) |              # Matches '10.'
             (\d*\.\d+)                   # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  # A '//' that is the last thing on the line, i.e. a comment with no text.
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  # Any run of characters that does not contain the closing '*/'.
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{}\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  # Same as DOC_COMMENT_TEXT, except that it also stops at whitespace.
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
  # Match anything that is allowed in a type definition, except for tokens
  # needed to parse it (and the lookahead assertion for "*/").
  DOC_COMMENT_TYPE_TEXT = re.compile(r'([^*|!?=<>(){}:,\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  # Like MANTISSA, this fragment is written in verbose form and is only valid
  # inside a regex compiled with re.VERBOSE (see REGEX).
  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the tokens in POST_REGEX_LIST
  # below to avoid incorrectly parsing something like x / y / z as
  # x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  # Matches the rest of the line (possibly nothing).
  ANYTHING = re.compile(r'.*')
  # Everything up to, but not including, the next ')'.
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = [
      'break',
      'case',
      'catch',
      'continue',
      'default',
      'do',
      'else',
      'finally',
      'for',
      'if',
      'return',
      'switch',
      'throw',
      'try',
      'var',
      'while',
      'with',
  ]

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  OPERATOR_LIST = [
      ',',
      r'\+\+',
      '===',
      '!==',
      '>>>=',
      '>>>',
      '==',
      '>=',
      '<=',
      '!=',
      '<<=',
      '>>=',
      '<<',
      '>>',
      '=>',
      '>',
      '<',
      r'\+=',
      r'\+',
      '--',
      r'\^=',
      '-=',
      '-',
      '/=',
      '/',
      r'\*=',
      r'\*',
      '%=',
      '%',
      '&&',
      r'\|\|',
      '&=',
      '&',
      r'\|=',
      r'\|',
      '=',
      '!',
      ':',
      r'\?',
      r'\^',
      r'\bdelete\b',
      r'\bin\b',
      r'\binstanceof\b',
      r'\bnew\b',
      r'\btypeof\b',
      r'\bvoid\b',
      r'\.',
  ]
  # One alternation built from OPERATOR_LIST; the list order is the
  # alternation order, which is why longer operators are listed first.
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier, except for trailing dots.
  NESTED_IDENTIFIER = r'[a-zA-Z_$]([%s]|\.[a-zA-Z_$])*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  # An identifier that is directly followed (after optional whitespace) by a
  # single '=' - that is, an assignment target and not the left side of '=='.
  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not follwed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is an @ sign followed by letters that appears at the beginning
  # of the line or after whitespace.  The look-behind check is necessary to not
  # match someone@google.com as a flag.  A flag that directly follows a '{' is
  # matched by DOC_INLINE_FLAG below instead.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names and complex doctypes containing
  # whitespace, we need to tokenize whitespace into a token after certain
  # doctags. All statetracker.HAS_TYPE that are not listed here must not contain
  # any whitespace in their types.
  DOC_FLAG_LEX_SPACES = re.compile(
      r'(^|(?<=\s))@(?P<name>%s)\b' %
      '|'.join([
          'const',
          'enum',
          'export',
          'extends',
          'final',
          'implements',
          'package',
          'param',
          'private',
          'protected',
          'public',
          'return',
          'type',
          'typedef'
      ]))

  # An @ sign followed by letters that comes immediately after a '{'.
  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  DOC_TYPE_BLOCK_START = re.compile(r'[<(]')
  DOC_TYPE_BLOCK_END = re.compile(r'[>)]')
  DOC_TYPE_MODIFIERS = re.compile(r'[!?|,:=]')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # NOTE(review): this was documented as being used for TYPE_GROUP, but no
  # TYPE_GROUP is defined in the visible source and nothing visible here
  # references SAFE_STAR - confirm it has no external users before removing it.
  SAFE_STAR = r'(\*(?!/))'

  # Matchers shared by both doc comment modes.  BuildMatchers() appends the
  # mode-specific text matchers after this list, so everything here takes
  # precedence over them.
  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),

      # And some more to parse types.
      Matcher(DOC_TYPE_BLOCK_START, Type.DOC_TYPE_START_BLOCK),
      Matcher(DOC_TYPE_BLOCK_END, Type.DOC_TYPE_END_BLOCK),

      Matcher(DOC_TYPE_MODIFIERS, Type.DOC_TYPE_MODIFIER),
      Matcher(DOC_COMMENT_TYPE_TEXT, Type.COMMENT),

      # NOTE(review): the trailing True is presumably Matcher's "only at the
      # start of a line" flag - confirm against matcher.Matcher.
      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }
307
308  @classmethod
309  def BuildMatchers(cls):
310    """Builds the token matcher group.
311
312    The token matcher groups work as follows: it is a list of Matcher objects.
313    The matchers will be tried in this order, and the first to match will be
314    returned.  Hence the order is important because the matchers that come first
315    overrule the matchers that come later.
316
317    Returns:
318      The completed token matcher group.
319    """
320    # Match a keyword string followed by a non-identifier character in order to
321    # not match something like doSomething as do + Something.
322    keyword = re.compile('(%s)((?=[^%s])|$)' % (
323        '|'.join(cls.KEYWORD_LIST), cls.IDENTIFIER_CHAR))
324    return {
325
326        # Matchers for basic text mode.
327        JavaScriptModes.TEXT_MODE: [
328            # Check a big group - strings, starting comments, and regexes - all
329            # of which could be intertwined.  'string with /regex/',
330            # /regex with 'string'/, /* comment with /regex/ and string */ (and
331            # so on)
332            Matcher(cls.START_DOC_COMMENT, Type.START_DOC_COMMENT,
333                    JavaScriptModes.DOC_COMMENT_MODE),
334            Matcher(cls.START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
335                    JavaScriptModes.BLOCK_COMMENT_MODE),
336            Matcher(cls.END_OF_LINE_SINGLE_LINE_COMMENT,
337                    Type.START_SINGLE_LINE_COMMENT),
338            Matcher(cls.START_SINGLE_LINE_COMMENT,
339                    Type.START_SINGLE_LINE_COMMENT,
340                    JavaScriptModes.LINE_COMMENT_MODE),
341            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
342                    JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
343            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
344                    JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
345            Matcher(cls.REGEX, Type.REGEX),
346
347            # Next we check for start blocks appearing outside any of the items
348            # above.
349            Matcher(cls.START_BLOCK, Type.START_BLOCK),
350            Matcher(cls.END_BLOCK, Type.END_BLOCK),
351
352            # Then we search for function declarations.
353            Matcher(cls.FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
354                    JavaScriptModes.FUNCTION_MODE),
355
356            # Next, we convert non-function related parens to tokens.
357            Matcher(cls.OPENING_PAREN, Type.START_PAREN),
358            Matcher(cls.CLOSING_PAREN, Type.END_PAREN),
359
360            # Next, we convert brackets to tokens.
361            Matcher(cls.OPENING_BRACKET, Type.START_BRACKET),
362            Matcher(cls.CLOSING_BRACKET, Type.END_BRACKET),
363
364            # Find numbers.  This has to happen before operators because
365            # scientific notation numbers can have + and - in them.
366            Matcher(cls.NUMBER, Type.NUMBER),
367
368            # Find operators and simple assignments
369            Matcher(cls.SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
370            Matcher(cls.OPERATOR, Type.OPERATOR),
371
372            # Find key words and whitespace.
373            Matcher(keyword, Type.KEYWORD),
374            Matcher(cls.WHITESPACE, Type.WHITESPACE),
375
376            # Find identifiers.
377            Matcher(cls.IDENTIFIER, Type.IDENTIFIER),
378
379            # Finally, we convert semicolons to tokens.
380            Matcher(cls.SEMICOLON, Type.SEMICOLON)],
381
382        # Matchers for single quote strings.
383        JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
384            Matcher(cls.SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
385            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
386                    JavaScriptModes.TEXT_MODE)],
387
388        # Matchers for double quote strings.
389        JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
390            Matcher(cls.DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
391            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
392                    JavaScriptModes.TEXT_MODE)],
393
394        # Matchers for block comments.
395        JavaScriptModes.BLOCK_COMMENT_MODE: [
396            # First we check for exiting a block comment.
397            Matcher(cls.END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
398                    JavaScriptModes.TEXT_MODE),
399
400            # Match non-comment-ending text..
401            Matcher(cls.BLOCK_COMMENT_TEXT, Type.COMMENT)],
402
403        # Matchers for doc comments.
404        JavaScriptModes.DOC_COMMENT_MODE: cls.COMMON_DOC_MATCHERS + [
405            Matcher(cls.DOC_COMMENT_TEXT, Type.COMMENT)],
406
407        JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: cls.COMMON_DOC_MATCHERS + [
408            Matcher(cls.WHITESPACE, Type.COMMENT),
409            Matcher(cls.DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],
410
411        # Matchers for single line comments.
412        JavaScriptModes.LINE_COMMENT_MODE: [
413            # We greedy match until the end of the line in line comment mode.
414            Matcher(cls.ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],
415
416        # Matchers for code after the function keyword.
417        JavaScriptModes.FUNCTION_MODE: [
418            # Must match open paren before anything else and move into parameter
419            # mode, otherwise everything inside the parameter list is parsed
420            # incorrectly.
421            Matcher(cls.OPENING_PAREN, Type.START_PARAMETERS,
422                    JavaScriptModes.PARAMETER_MODE),
423            Matcher(cls.WHITESPACE, Type.WHITESPACE),
424            Matcher(cls.IDENTIFIER, Type.FUNCTION_NAME)],
425
426        # Matchers for function parameters
427        JavaScriptModes.PARAMETER_MODE: [
428            # When in function parameter mode, a closing paren is treated
429            # specially. Everything else is treated as lines of parameters.
430            Matcher(cls.CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
431                    JavaScriptModes.TEXT_MODE),
432            Matcher(cls.PARAMETERS, Type.PARAMETERS,
433                    JavaScriptModes.PARAMETER_MODE)]}
434
435  def __init__(self, parse_js_doc=True):
436    """Create a tokenizer object.
437
438    Args:
439      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
440          or simply treat them as normal comments.  Defaults to parsing JsDoc.
441    """
442    matchers = self.BuildMatchers()
443    if not parse_js_doc:
444      # Make a copy so the original doesn't get modified.
445      matchers = copy.deepcopy(matchers)
446      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
447          JavaScriptModes.BLOCK_COMMENT_MODE]
448
449    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
450        self.JAVASCRIPT_DEFAULT_TYPES)
451
452  def _CreateToken(self, string, token_type, line, line_number, values=None):
453    """Creates a new JavaScriptToken object.
454
455    Args:
456      string: The string of input the token contains.
457      token_type: The type of token.
458      line: The text of the line this token is in.
459      line_number: The line number of the token.
460      values: A dict of named values within the token.  For instance, a
461        function declaration may have a value called 'name' which captures the
462        name of the function.
463    """
464    return javascripttokens.JavaScriptToken(string, token_type, line,
465                                            line_number, values, line_number)
466