JavascriptParserImpl.java revision 56ed4167b942ec265f9cee70ac4d71d10b3835ce
1/*
2 * Copyright (C) 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.streamhtmlparser.impl;
18
19import com.google.common.collect.Maps;
20import com.google.streamhtmlparser.ExternalState;
21import com.google.streamhtmlparser.JavascriptParser;
22import com.google.streamhtmlparser.util.HtmlUtils;
23import com.google.streamhtmlparser.util.JavascriptTokenBuffer;
24
25import java.util.Map;
26
27/**
28 * <p>Many comments copied almost verbatim from the original C version.
29 */
30public class JavascriptParserImpl extends GenericParser
31    implements JavascriptParser {
32
33  final static InternalState JS_TEXT;
34  final static InternalState JS_Q;
35  final static InternalState JS_Q_E;
36  final static InternalState JS_DQ;
37  final static InternalState JS_DQ_E;
38  final static InternalState JS_SLASH;
39  final static InternalState JS_REGEXP_SLASH;
40  final static InternalState JS_REGEXP;
41  final static InternalState JS_REGEXP_BRK;
42  final static InternalState JS_REGEXP_BRK_E;
43  final static InternalState JS_REGEXP_E;
44  final static InternalState JS_COM_LN;
45  final static InternalState JS_COM_ML;
46  final static InternalState JS_COM_ML_CLOSE;
47  final static InternalState JS_COM_AFTER;
48
49  static {
50    JS_TEXT = InternalState.getInstanceJavascript("JS_TEXT");
51    JS_Q  = InternalState.getInstanceJavascript("JS_Q");
52    JS_Q_E = InternalState.getInstanceJavascript("JS_Q_E");
53    JS_DQ = InternalState.getInstanceJavascript("JS_DQ");
54    JS_DQ_E = InternalState.getInstanceJavascript("JS_DQ_E");
55    JS_SLASH = InternalState.getInstanceJavascript("JS_SLASH");
56    JS_REGEXP = InternalState.getInstanceJavascript("JS_REGEXP");
57    JS_REGEXP_SLASH = InternalState.getInstanceJavascript("JS_REGEXP_SLASH");
58    JS_REGEXP_E = InternalState.getInstanceJavascript("JS_REGEXP_E");
59    JS_REGEXP_BRK = InternalState.getInstanceJavascript("JS_REGEXP_BRK");
60    JS_REGEXP_BRK_E = InternalState.getInstanceJavascript("JS_REGEXP_BRK_E");
61    JS_COM_LN = InternalState.getInstanceJavascript("COMMENT_LN");
62    JS_COM_ML = InternalState.getInstanceJavascript("COMMENT_ML");
63    JS_COM_ML_CLOSE = InternalState.getInstanceJavascript("COMMENT_ML_CLOSE");
64    JS_COM_AFTER = InternalState.getInstanceJavascript("COMMENT_AFTER");
65  }
66
67  private static final Map<InternalState, ExternalState> STATE_MAPPING =
68      Maps.newHashMap();
69  static {
70    initializeStateMapping();
71  }
72
73  private static final ParserStateTable STATE_TABLE = new ParserStateTable();
74  static {
75    initializeParserStateTable();
76  }
77
78  private final JavascriptTokenBuffer ccBuffer;
79
80  /**
81   * Creates a {@code JavascriptParserImpl} object.
82   */
83  public JavascriptParserImpl() {
84    super(STATE_TABLE, STATE_MAPPING, JS_TEXT);
85    ccBuffer = new JavascriptTokenBuffer();
86  }
87
88  /**
89   * Creates a {@code JavascriptParserImpl} object that is a copy
90   * of the one provided.
91   *
92   * @param aJavascriptParserImpl the {@code JavascriptParserImpl} to copy
93   */
94  public JavascriptParserImpl(JavascriptParserImpl aJavascriptParserImpl) {
95    super(aJavascriptParserImpl);
96    ccBuffer = new JavascriptTokenBuffer(aJavascriptParserImpl.ccBuffer);
97  }
98
99  @Override
100  public void reset() {
101    super.reset();
102    currentState = JS_TEXT;
103  }
104
105  @Override
106  protected InternalState handleEnterState(InternalState currentState,
107                                           InternalState expectedNextState,
108                                           char input) {
109    InternalState nextState = expectedNextState;
110    if (currentState == JS_SLASH) {
111      nextState = enterStateJsSlash(currentState, input);
112    } else if (currentState == JS_COM_AFTER) {
113      enterStateJsCommentAfter();
114    }
115    return nextState;
116  }
117
118  @Override
119  protected InternalState handleExitState(InternalState currentState,
120                                          InternalState expectedNextState,
121                                          char input) {
122    // Nothing to do - no handlers for exit states
123    return expectedNextState;
124  }
125
126  @Override
127  protected InternalState handleInState(InternalState currentState,
128                                        char input) {
129    if (currentState == JS_TEXT) {
130      inStateJsText(input);
131    }
132    return currentState;
133  }
134
135  /**
136   * Called every time we find a slash ('/') character in the javascript
137   * text (except for slashes that close comments or regexp literals).
138   *
139   * <p>Comment copied verbatim from the corresponding C-version.
140   *
141   * <p>Implements the logic to figure out if this slash character is a
142   * division operator or if it opens a regular expression literal.
143   * This is heavily inspired by the syntactic resynchronization
144   * for javascript 2.0:
145   *
146   * <p>When we receive a '/', we look at the previous non space character
147   * to figure out if it's the ending of a punctuator that can precede a
148   * regexp literal, in which case we assume the current '/' is part of a
149   * regular expression literal (or the opening of a javascript comment,
150   * but that part is dealt with in the state machine). The exceptions to
151   * this are unary operators, so we look back a second character to rule
152   * out '++' and '--'.
153   *
154   * <p> Although it is not straightforward to figure out if the binary
155   * operator is a postfix of the previous expression or a prefix of the
156   * regular expression, we rule out the later as it is an uncommon practice.
157   *
158   * <p>If we ruled out the previous token to be a valid regexp preceding
159   * punctuator, we extract the last identifier in the buffer and match
160   * against a list of keywords that are known to precede expressions in
161   * the grammar. If we get a match on any of these keywords, then we are
162   * opening a regular expression, if not, then we have a division operator.
163   *
164   * <p>Known cases that are accepted by the grammar but we handle
165   * differently, although I (falmeida) don't believe there is a
166   * legitimate usage for those:
167   *   Division of a regular expression: var result = /test/ / 5;
168   *   Prefix unary increment of a regular expression: var result = ++/test/;
169   *   Division of an object literal: { a: 1 } /x/.exec('x');
170   *
171   * @param state being entered to
172   * @param input character being processed
173   * @return state next state to go to, may be the same as the one we
174   *     were called with
175   *
176   * <a>http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html>
177   * Syntactic Resynchronization</a>
178   */
179  private InternalState enterStateJsSlash(InternalState state, char input) {
180
181    InternalState nextState = state;
182    int position = -1;
183
184    // Consume the last whitespace
185    if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(position))) {
186      --position;
187    }
188
189    switch (ccBuffer.getChar(position)) {
190      // Ignore unary increment
191      case '+':
192        if (ccBuffer.getChar(position - 1) != '+') {
193          nextState = JS_REGEXP_SLASH;
194        }
195        break;
196      case '-':
197        // Ignore unary decrement
198        if (ccBuffer.getChar(position - 1) != '-') {
199          nextState = JS_REGEXP_SLASH;
200        }
201        break;
202        // List of punctuator endings except ), ], }, + and - *
203      case '=':
204      case '<':
205      case '>':
206      case '&':
207      case '|':
208      case '!':
209      case '%':
210      case '*':
211      case '/':
212      case ',':
213      case ';':
214      case '?':
215      case ':':
216      case '^':
217      case '~':
218      case '{':
219      case '(':
220      case '[':
221      case '}':
222      case '\0':
223        nextState = JS_REGEXP_SLASH;
224        break;
225      default:
226        String lastIdentifier = ccBuffer.getLastIdentifier();
227        if (lastIdentifier != null && HtmlUtils
228            .isJavascriptRegexpPrefix(lastIdentifier)) {
229          nextState = JS_REGEXP_SLASH;
230        }
231    }
232    ccBuffer.appendChar(input);
233    return nextState;
234  }
235
236  /**
237   * Called at the end of a javascript comment.
238   *
239   * <p>When we open a comment, the initial '/' was inserted into the ring
240   * buffer, but it is not a token and should be considered whitespace
241   * for parsing purposes.
242   *
243   * <p>When we first saw the '/' character, we didn't yet know if it was
244   * the beginning of a comment, a division operator, or a regexp.
245   *
246   * <p>In this function we just replace the inital '/' with a whitespace
247   * character, unless we had a preceding whitespace character, in which
248   * case we just remove the '/'. This is needed to ensure all spaces in
249   * the buffer are correctly folded.
250   */
251  private void enterStateJsCommentAfter() {
252    if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(-2))) {
253      ccBuffer.popChar();
254    } else {
255      ccBuffer.setChar(-1, ' ');
256    }
257  }
258
259  private void inStateJsText(char input) {
260    ccBuffer.appendChar(input);
261  }
262
263// ======================================================= //
264// SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE.     //
265// ======================================================= //
266
267  private static void registerMapping(InternalState internalState,
268                                      ExternalState externalState) {
269    STATE_MAPPING.put(internalState, externalState);
270  }
271
272  private static void initializeStateMapping() {
273    // Each parser implementation must map the error state appropriately.
274    registerMapping(InternalState.INTERNAL_ERROR_STATE,
275                    JavascriptParser.STATE_ERROR);
276
277    registerMapping(JS_TEXT, JavascriptParser.STATE_TEXT);
278    registerMapping(JS_Q, JavascriptParser.STATE_Q);
279    registerMapping(JS_Q_E, JavascriptParser.STATE_Q);
280    registerMapping(JS_DQ, JavascriptParser.STATE_DQ);
281    registerMapping(JS_DQ_E, JavascriptParser.STATE_DQ);
282    registerMapping(JS_SLASH, JavascriptParser.STATE_TEXT);
283    registerMapping(JS_REGEXP_SLASH, JavascriptParser.STATE_TEXT);
284    registerMapping(JS_REGEXP, JavascriptParser.STATE_REGEXP);
285    registerMapping(JS_REGEXP_BRK,JavascriptParser.STATE_REGEXP);
286    registerMapping(JS_REGEXP_BRK_E, JavascriptParser.STATE_REGEXP);
287    registerMapping(JS_REGEXP_E,JavascriptParser.STATE_REGEXP);
288    registerMapping(JS_COM_LN, JavascriptParser.STATE_COMMENT);
289    registerMapping(JS_COM_ML, JavascriptParser.STATE_COMMENT);
290    registerMapping(JS_COM_ML_CLOSE, JavascriptParser.STATE_COMMENT);
291    registerMapping(JS_COM_AFTER, JavascriptParser.STATE_TEXT);
292  }
293
294  private static void registerTransition(String expression,
295                                         InternalState source,
296                                         InternalState to) {
297    // It seems to silly to go through a StateTableTransition here
298    // but it adds extra data checking.
299    StateTableTransition stt = new StateTableTransition(expression,
300                                                        source, to);
301    STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
302                              stt.getTo());
303  }
304
305  private static void initializeParserStateTable() {
306    registerTransition("[:default:]", JS_COM_AFTER, JS_TEXT);
307    registerTransition("/", JS_COM_AFTER, JS_SLASH);
308    registerTransition("\"", JS_COM_AFTER, JS_DQ);
309    registerTransition("\'", JS_COM_AFTER, JS_Q);
310    registerTransition("[:default:]", JS_COM_ML_CLOSE, JS_COM_ML);
311    registerTransition("/", JS_COM_ML_CLOSE,JS_COM_AFTER);
312    registerTransition("[:default:]", JS_COM_ML, JS_COM_ML);
313    registerTransition("*", JS_COM_ML, JS_COM_ML_CLOSE);
314    registerTransition("[:default:]", JS_COM_LN,JS_COM_LN);
315    registerTransition("\n", JS_COM_LN,JS_COM_AFTER);
316    registerTransition("[:default:]", JS_REGEXP_E, JS_REGEXP);
317    registerTransition("[:default:]", JS_REGEXP_BRK_E, JS_REGEXP_BRK);
318    registerTransition("[:default:]", JS_REGEXP_BRK, JS_REGEXP_BRK);
319    registerTransition("]", JS_REGEXP_BRK, JS_REGEXP);
320    registerTransition("\\", JS_REGEXP_BRK, JS_REGEXP_BRK_E);
321    registerTransition("[:default:]", JS_REGEXP, JS_REGEXP);
322    registerTransition("/", JS_REGEXP, JS_TEXT);
323    registerTransition("[", JS_REGEXP, JS_REGEXP_BRK);
324    registerTransition("\\", JS_REGEXP, JS_REGEXP_E);
325    registerTransition("[:default:]", JS_REGEXP_SLASH, JS_REGEXP);
326    registerTransition("[", JS_REGEXP_SLASH, JS_REGEXP_BRK);
327    registerTransition("\\", JS_REGEXP_SLASH, JS_REGEXP_E);
328    registerTransition("*", JS_REGEXP_SLASH, JS_COM_ML);
329    registerTransition("/", JS_REGEXP_SLASH, JS_COM_LN);
330    registerTransition("[:default:]", JS_SLASH, JS_TEXT);
331    registerTransition("*", JS_SLASH, JS_COM_ML);
332    registerTransition("/", JS_SLASH, JS_COM_LN);
333    registerTransition("[:default:]", JS_DQ_E,JS_DQ);
334    registerTransition("[:default:]", JS_DQ,JS_DQ);
335    registerTransition("\"", JS_DQ, JS_TEXT);
336    registerTransition("\\", JS_DQ, JS_DQ_E);
337    registerTransition("[:default:]", JS_Q_E,JS_Q);
338    registerTransition("[:default:]", JS_Q,JS_Q);
339    registerTransition("\'", JS_Q, JS_TEXT);
340    registerTransition("\\", JS_Q, JS_Q_E);
341    registerTransition("[:default:]", JS_TEXT, JS_TEXT);
342    registerTransition("/", JS_TEXT, JS_SLASH);
343    registerTransition("\"", JS_TEXT, JS_DQ);
344    registerTransition("\'", JS_TEXT, JS_Q);
345  }
346}