1/*
2 * Copyright (C) 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.streamhtmlparser.impl;
18
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.streamhtmlparser.ExternalState;
import com.google.streamhtmlparser.HtmlParser;
import com.google.streamhtmlparser.ParseException;
import com.google.streamhtmlparser.util.CharacterRecorder;
import com.google.streamhtmlparser.util.EntityResolver;
import com.google.streamhtmlparser.util.HtmlUtils;

import java.util.Locale;
import java.util.Map;
29
30/**
31 * A custom specialized parser - ported from the main C++ version - used to
32 * implement context-aware escaping of run-time data in web-application
33 * templates.
34 *
35 * <p>This is the main class in the package. It implements the
36 * {@code HtmlParser} interface.
37 *
 * <p>This class is not thread-safe; in particular, you cannot invoke any
 * state-changing operations (such as {@code parse}) from multiple threads
 * on the same object.
41 *
42 * <p>If you are looking at this class, chances are very high you are
43 * implementing Auto-Escaping for a new template system. Please see the
44 * landing page including a design document at
45 * <a href="http://go/autoescape">Auto-Escape Landing Page</a>.
46 */
47public class HtmlParserImpl extends GenericParser implements HtmlParser {
48
49  /*
50   * Internal representation of the parser state, which is at a
51   * finer-granularity than the external state as given to callers.
52   * The relationship between <code>InternalState</code> and
53   * <code>ExternalState</code> is a many-to-one relationship.
54   */
55  private static final InternalState TEXT;
56  private static final InternalState TAG_START;
57  private static final InternalState TAG_NAME;
58  private static final InternalState DECL_START;
59  private static final InternalState DECL_BODY;
60  private static final InternalState COM_OPEN;
61  private static final InternalState COM_BODY;
62  private static final InternalState COM_DASH;
63  private static final InternalState COM_DASH_DASH;
64  private static final InternalState PI;
65  private static final InternalState PI_MAY_END;
66  private static final InternalState TAG_SPACE;
67  private static final InternalState TAG_CLOSE;
68  private static final InternalState ATTR;
69  private static final InternalState ATTR_SPACE;
70  private static final InternalState VALUE;
71  private static final InternalState VALUE_TEXT;
72  private static final InternalState VALUE_Q_START;
73  private static final InternalState VALUE_Q;
74  private static final InternalState VALUE_DQ_START;
75  private static final InternalState VALUE_DQ;
76  private static final InternalState CDATA_COM_START;
77  private static final InternalState CDATA_COM_START_DASH;
78  private static final InternalState CDATA_COM_BODY;
79  private static final InternalState CDATA_COM_DASH;
80  private static final InternalState CDATA_COM_DASH_DASH;
81  private static final InternalState CDATA_TEXT;
82  private static final InternalState CDATA_LT;
83  private static final InternalState CDATA_MAY_CLOSE;
84  private static final InternalState JS_FILE;
85  private static final InternalState CSS_FILE;
86
87  static {
88    TEXT = InternalState.getInstanceHtml("TEXT");
89    TAG_START = InternalState.getInstanceHtml("TAG_START");
90    TAG_NAME = InternalState.getInstanceHtml("TAG_NAME");
91    DECL_START = InternalState.getInstanceHtml("DECL_START");
92    DECL_BODY = InternalState.getInstanceHtml("DECL_BODY");
93    COM_OPEN = InternalState.getInstanceHtml("COM_OPEN");
94    COM_BODY = InternalState.getInstanceHtml("COM_BODY");
95    COM_DASH = InternalState.getInstanceHtml("COM_DASH");
96    COM_DASH_DASH = InternalState.getInstanceHtml("COM_DASH_DASH");
97    PI =InternalState.getInstanceHtml("PI");
98    PI_MAY_END = InternalState.getInstanceHtml("PI_MAY_END");
99    TAG_SPACE = InternalState.getInstanceHtml("TAG_SPACE");
100    TAG_CLOSE = InternalState.getInstanceHtml("TAG_CLOSE");
101    ATTR = InternalState.getInstanceHtml("ATTR");
102    ATTR_SPACE = InternalState.getInstanceHtml("ATTR_SPACE");
103    VALUE = InternalState.getInstanceHtml("VALUE");
104    VALUE_TEXT = InternalState.getInstanceHtml("VALUE_TEXT");
105    VALUE_Q_START = InternalState.getInstanceHtml("VALUE_Q_START");
106    VALUE_Q = InternalState.getInstanceHtml("VALUE_Q");
107    VALUE_DQ_START = InternalState.getInstanceHtml("VALUE_DQ_START");
108    VALUE_DQ = InternalState.getInstanceHtml("VALUE_DQ");
109    CDATA_COM_START = InternalState.getInstanceHtml("CDATA_COM_START");
110    CDATA_COM_START_DASH =
111        InternalState.getInstanceHtml("CDATA_COM_START_DASH");
112    CDATA_COM_BODY = InternalState.getInstanceHtml("CDATA_COM_BODY");
113    CDATA_COM_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH");
114    CDATA_COM_DASH_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH_DASH");
115    CDATA_TEXT = InternalState.getInstanceHtml("CDATA_TEXT");
116    CDATA_LT = InternalState.getInstanceHtml("CDATA_LT");
117    CDATA_MAY_CLOSE = InternalState.getInstanceHtml("CDATA_MAY_CLOSE");
118    JS_FILE = InternalState.getInstanceHtml("JS_FILE");
119    CSS_FILE = InternalState.getInstanceHtml("CSS_FILE");
120  }
121
122  private static final Map<InternalState, ExternalState> STATE_MAPPING =
123      Maps.newHashMap();
124  static {
125    initializeStateMapping();
126  }
127
128  private static final ParserStateTable STATE_TABLE = new ParserStateTable();
129  static {
130    initializeParserStateTable();
131  }
132
133  private final CharacterRecorder tag;
134  private final CharacterRecorder attr;
135  private final CharacterRecorder value;
136  private final CharacterRecorder cdataCloseTag;
137  private final EntityResolver entityResolver;
138  private final JavascriptParserImpl jsParser;
139  private boolean insideJavascript;
140  private int valueIndex;
141  // True iff InsertText() was called at the start of a URL attribute value.
142  private boolean textInsideUrlValue;
143
144  /**
145   * Creates an {@code HtmlParserImpl} object.
146   *
147   * <p>Both for performance reasons and to leverage code a state-flow machine
148   * that is automatically generated from Python for multiple target
149   * languages, this object uses a static {@code ParserStateTable} that
150   * is read-only and obtained from the generated code in {@code HtmlParserFsm}.
151   * That code also maintains the mapping from internal states
152   * ({@code InternalState}) to external states ({@code ExternalState}).
153   */
154  public HtmlParserImpl() {
155    super(STATE_TABLE, STATE_MAPPING, TEXT);
156    tag = new CharacterRecorder();
157    attr = new CharacterRecorder();
158    value = new CharacterRecorder();
159    cdataCloseTag = new CharacterRecorder();
160    entityResolver = new EntityResolver();
161    jsParser = new JavascriptParserImpl();
162    insideJavascript = false;
163    valueIndex = 0;
164    textInsideUrlValue = false;
165  }
166
167  /**
168   * Creates an {@code HtmlParserImpl} that is a copy of the one provided.
169   *
170   * @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy
171   */
172  public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) {
173    super(aHtmlParserImpl);
174    tag = new CharacterRecorder(aHtmlParserImpl.tag);
175    attr = new CharacterRecorder(aHtmlParserImpl.attr);
176    value = new CharacterRecorder(aHtmlParserImpl.value);
177    cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag);
178    entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver);
179    jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser);
180    insideJavascript = aHtmlParserImpl.insideJavascript;
181    valueIndex = aHtmlParserImpl.valueIndex;
182    textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue;
183  }
184
185  @Override
186  public boolean inJavascript() {
187    return (insideJavascript
188            && ( (getState() == STATE_VALUE)
189                 || (currentState == CDATA_TEXT)
190                 || (currentState == CDATA_COM_START)
191                 || (currentState == CDATA_COM_START_DASH)
192                 || (currentState == CDATA_COM_BODY)
193                 || (currentState == CDATA_COM_DASH)
194                 || (currentState == CDATA_COM_DASH_DASH)
195                 || (currentState == CDATA_LT)
196                 || (currentState == CDATA_MAY_CLOSE)
197                 || (currentState == JS_FILE) ));
198  }
199
200  @Override
201  public boolean isJavascriptQuoted() {
202    if (inJavascript()) {
203      ExternalState jsParserState = jsParser.getState();
204      return (jsParserState == JavascriptParserImpl.STATE_Q
205              || jsParserState == JavascriptParserImpl.STATE_DQ);
206    }
207    return false;
208  }
209
210  @Override
211  public boolean inAttribute() {
212    ExternalState extState = getState();
213    return (extState != null && (extState == STATE_ATTR
214                                 || extState == STATE_VALUE));
215  }
216
217  /**
218   * Returns {@code true} if and only if the parser is currently within
219   * a CSS context. A CSS context is one of the below:
220   * <ul>
221   * <li>Inside a STYLE tag.
222   * <li>Inside a STYLE attribute.
223   * <li>Inside a CSS file when the parser was reset in the CSS mode.
224   * </ul>
225   *
226   * @return {@code true} if and only if the parser is inside CSS
227   */
228  @Override
229  public boolean inCss() {
230    return (currentState == CSS_FILE
231            || (getState() == STATE_VALUE
232                && (getAttributeType() == ATTR_TYPE.STYLE))
233            || ("style".equals(getTag())));
234  }
235
236  @Override
237  public ATTR_TYPE getAttributeType() {
238    String attribute = getAttribute();
239    if (!inAttribute()) {
240      return ATTR_TYPE.NONE;
241    }
242    if (HtmlUtils.isAttributeJavascript(attribute)) {
243      return ATTR_TYPE.JS;
244    }
245    if (HtmlUtils.isAttributeUri(attribute)) {
246      return ATTR_TYPE.URI;
247    }
248    if (HtmlUtils.isAttributeStyle(attribute)) {
249      return ATTR_TYPE.STYLE;
250    }
251
252    // Special logic to handle the "content" attribute of the "meta" tag.
253    if ("meta".equals(getTag()) && "content".equals(getAttribute())) {
254      HtmlUtils.META_REDIRECT_TYPE redirectType =
255          HtmlUtils.parseContentAttributeForUrl(getValue());
256      if (redirectType == HtmlUtils.META_REDIRECT_TYPE.URL_START ||
257          redirectType == HtmlUtils.META_REDIRECT_TYPE.URL)
258        return ATTR_TYPE.URI;
259    }
260
261    return ATTR_TYPE.REGULAR;
262  }
263
264  @Override
265  public ExternalState getJavascriptState() {
266    return jsParser.getState();
267  }
268
269  @Override
270  public boolean isAttributeQuoted() {
271    return (currentState == VALUE_Q_START
272            || currentState == VALUE_Q
273            || currentState == VALUE_DQ_START
274            || currentState == VALUE_DQ);
275  }
276
277  @Override
278  public String getTag() {
279    return tag.getContent().toLowerCase();
280  }
281
282  @Override
283  public String getAttribute() {
284    return inAttribute() ? attr.getContent().toLowerCase() : "";
285  }
286
287  @Override
288  public String getValue() {
289    return (getState() == STATE_VALUE) ? value.getContent() : "";
290  }
291
292  @Override
293  public int getValueIndex() {
294    if (getState() != STATE_VALUE) {
295      return 0;
296    }
297    return valueIndex;
298  }
299
300  @Override
301  public boolean isUrlStart() {
302    // False when not inside an HTML attribute value
303    if (getState() != STATE_VALUE) {
304      return false;
305    }
306
307    //  Or when the HTML attribute is not of URI type.
308    if (getAttributeType() != ATTR_TYPE.URI) {
309      return false;
310    }
311
312    // Or when we received an InsertText() directive at the start of a URL.
313    if (textInsideUrlValue) {
314      return false;
315    }
316
317    if ("meta".equals(getTag())) {
318      // At this point, we know we are in the "content" attribute
319      // or we would not have the URI attribute type.
320      return (HtmlUtils.parseContentAttributeForUrl(getValue()) ==
321              HtmlUtils.META_REDIRECT_TYPE.URL_START);
322    }
323
324    // For all other URI attributes, check if we are at index 0.
325    return (getValueIndex() == 0);
326}
327
328  /**
329   * {@inheritDoc}
330   *
331   * Resets the state of the parser to a state consistent with the
332   * {@code Mode} provided. This will reset finer-grained state
333   * information back to a default value, hence use only when
334   * you want to parse text from a very clean slate.
335   *
336   * <p>See the {@link HtmlParser.Mode} enum for information on all
337   * the valid modes.
338   *
339   * @param mode is an enum representing the high-level state of the parser
340   */
341  @Override
342  public void resetMode(Mode mode) {
343    insideJavascript = false;
344    tag.reset();
345    attr.reset();
346    value.reset();
347    cdataCloseTag.reset();
348    valueIndex = 0;
349    textInsideUrlValue = false;
350    jsParser.reset();
351
352    switch (mode) {
353      case HTML:
354        currentState = TEXT;
355        break;
356      case JS:
357        currentState = JS_FILE;
358        insideJavascript = true;
359        break;
360      case CSS:
361        currentState = CSS_FILE;
362        break;
363      case HTML_IN_TAG:
364        currentState = TAG_SPACE;
365        break;
366      default:
367        throw new IllegalArgumentException("Did not recognize Mode: " +
368                                           mode.toString());
369    }
370  }
371
372  /**
373   * Resets the state of the parser to the initial state of parsing HTML.
374   */
375  public void reset() {
376    super.reset();
377    resetMode(Mode.HTML);
378  }
379
380  /**
381   * A specialized directive to tell the parser there is some content
382   * that will be inserted here but that it will not get to parse. Used
383   * by the template system that may not be able to give some content
384   * to the parser but wants it to know there typically will be content
385   * inserted at that point.  This is a hint used in corner cases within
386   * parsing of HTML attribute names and values where content we do not
387   * get to see could affect our parsing and alter our current state.
388   *
389   * <p>The two cases where {@code #insertText()} affects our parsing are:
390   * <ul>
391   * <li>We are at the start of the value of a URL-accepting HTML attribute. In
392   * that case, we change internal state to no longer be considered at the
393   * start of the URL. This may affect what escaping template systems may want
394   * to perform on the HTML attribute value. We avoid injecting fake data and
395   * hence not modify the current index of the value as determined by
396   * {@link #getValueIndex()}</li>
397   * <li>We just transitioned from an attribute name to an attribute value
398   * (by parsing the separating {@code '='} character). In that case, we
399   * change internal state to be now inside a non-quoted HTML attribute
400   * value.</li>
401   * </ul>
402   *
403   * @throws ParseException if an unrecoverable error occurred during parsing
404   */
405  @Override
406  public void insertText() throws ParseException {
407    // Case: Inside URL attribute value.
408    if (getState() == STATE_VALUE
409        && getAttributeType() == ATTR_TYPE.URI
410        && isUrlStart()) {
411      textInsideUrlValue = true;
412    }
413    // Case: Before parsing any attribute value.
414    if (currentState == VALUE) {
415      setNextState(VALUE_TEXT);
416    }
417  }
418
419  @Override
420  protected InternalState handleEnterState(InternalState currentState,
421                                           InternalState expectedNextState,
422                                           char input) {
423    InternalState nextState = expectedNextState;
424    if (currentState == TAG_NAME) {
425      enterTagName();
426    } else if (currentState == ATTR) {
427      enterAttribute();
428    } else if (currentState == TAG_CLOSE) {
429      nextState = tagClose(currentState);
430    } else if (currentState == CDATA_MAY_CLOSE) {
431      enterStateCdataMayClose();
432    } else if (currentState == VALUE) {
433      enterValue();
434    } else
435    if (currentState == VALUE_TEXT || currentState == VALUE_Q
436        || currentState == VALUE_DQ) {
437      enterValueContent();
438    }
439    return nextState;
440  }
441
442  @Override
443  protected InternalState handleExitState(InternalState currentState,
444                                          InternalState expectedNextState,
445                                          char input) {
446    InternalState nextState = expectedNextState;
447    if (currentState == TAG_NAME) {
448      exitTagName();
449    } else if (currentState == ATTR) {
450      exitAttribute();
451    } else if (currentState == CDATA_MAY_CLOSE) {
452      nextState = exitStateCdataMayClose(nextState, input);
453    } else
454    if ((currentState == VALUE_TEXT) || (currentState == VALUE_Q)
455        || (currentState == VALUE_DQ)) {
456      exitValueContent();
457    }
458    return nextState;
459  }
460
461  @Override
462  protected InternalState handleInState(InternalState currentState,
463                                        char input) throws ParseException {
464    if ((currentState == CDATA_TEXT)
465        || (currentState == CDATA_COM_START)
466        || (currentState == CDATA_COM_START_DASH)
467        || (currentState == CDATA_COM_BODY)
468        || (currentState == CDATA_COM_DASH)
469        || (currentState == CDATA_COM_DASH_DASH)
470        || (currentState == CDATA_LT)
471        || (currentState == CDATA_MAY_CLOSE)
472        || (currentState == JS_FILE)) {
473      inStateCdata(input);
474    } else if ((currentState == VALUE_TEXT)
475               || (currentState == VALUE_Q)
476               || (currentState == VALUE_DQ)) {
477      inStateValue(input);
478    }
479    return currentState;
480  }
481
482  /**
483   * Invokes recording on all CharacterRecorder objects. Currently we do
484   * not check that one and only one of them is recording. I did a fair
485   * bit of testing on the C++ parser and was not convinced there is
486   * such a guarantee.
487   */
488  @Override
489  protected void record(char input) {
490    attr.maybeRecord(input);
491    tag.maybeRecord(input);
492    value.maybeRecord(input);
493    cdataCloseTag.maybeRecord(input);
494  }
495
496  /**
497   * Starts recording the name of the HTML tag. Called when the parser
498   * enters a new tag.
499   */
500  private void enterTagName() {
501    tag.startRecording();
502  }
503
504  private void exitTagName() {
505    tag.stopRecording();
506    String tagString = tag.getContent();
507    if (!tagString.isEmpty() && tagString.charAt(0) == '/') {
508      tag.reset();
509    }
510  }
511
512  /**
513   * Starts recording the name of the HTML attribute. Called when the parser
514   * enters a new HTML attribute.
515   */
516  private void enterAttribute() {
517    attr.startRecording();
518  }
519
520  private void exitAttribute() {
521    attr.stopRecording();
522  }
523
524  /**
525   * Tracks the index within the HTML attribute value and initializes
526   * the javascript parser for attributes that take javascript.
527   *
528   * Called when the parser enters a new HTML attribute value.
529   */
530  private void enterValue() {
531    valueIndex = 0;
532    textInsideUrlValue = false;
533    if (HtmlUtils.isAttributeJavascript(getAttribute())) {
534      entityResolver.reset();
535      jsParser.reset();
536      insideJavascript = true;
537    } else {
538      insideJavascript = false;
539    }
540  }
541
542  /**
543   * Starts recordning the contents of the attribute value.
544   *
545   * Called when entering an attribute value.
546   */
547  private void enterValueContent() {
548    value.startRecording();
549  }
550
551  /**
552   * Stops the recording of the attribute value and exits javascript
553   * (in case we were inside it).
554   */
555  private void exitValueContent() {
556    value.stopRecording();
557    insideJavascript = false;
558  }
559
560  /**
561   * Processes javascript after performing entity resolution and updates
562   * the position within the attribute value.
563   * If the status of the entity resolution is <code>IN_PROGRESS</code>,
564   * we don't invoke the javascript parser.
565   *
566   * <p>Called for every character inside an attribute value.
567   *
568   * @param input character read
569   * @throws ParseException if an unrecoverable error occurred during parsing
570   */
571  private void inStateValue(char input) throws ParseException {
572    valueIndex++;
573    if (insideJavascript) {
574      EntityResolver.Status status = entityResolver.processChar(input);
575      if (status == EntityResolver.Status.COMPLETED) {
576        jsParser.parse(entityResolver.getEntity());
577        entityResolver.reset();
578      } else if (status == EntityResolver.Status.NOT_STARTED) {
579        jsParser.parse(input);
580      }
581    }
582  }
583
584  /**
585   * Handles the tag it finished reading.
586   *
587   * <p>For a script tag, it initializes the javascript parser. For all
588   * tags that are recognized to have CDATA values
589   * (including the script tag), it switches the CDATA state to handle them
590   * properly. For code simplification, CDATA and RCDATA sections are
591   * treated the same.
592   *
593   * <p>Called when the parser leaves a tag definition.
594   *
595   * @param state current state
596   * @return state next state, could be the same as current state
597   */
598  private InternalState tagClose(InternalState state) {
599    InternalState nextState = state;
600    String tagName = getTag();
601    if ("script".equals(tagName)) {
602      nextState = CDATA_TEXT;
603      jsParser.reset();
604      insideJavascript = true;
605    } else if ("style".equals(tagName)
606                 || "title".equals(tagName)
607                 || "textarea".equals(tagName)) {
608      nextState = CDATA_TEXT;
609      insideJavascript = false;
610    }
611    return nextState;
612  }
613
614  /**
615   * Feeds the character to the javascript parser for processing.
616   *
617   * <p>Called inside CDATA blocks to parse javascript.
618   *
619   * @param input character read
620   * @throws ParseException if an unrecoverable error occurred during parsing
621   */
622  private void inStateCdata(char input) throws ParseException {
623    if (insideJavascript) {
624      jsParser.parse(input);
625    }
626  }
627
628  /**
629   * Starts recording. This is so we find the closing tag name in order to
630   * know if the tag is going to be closed or not.
631   *
632   * <p>Called when encountering a '<' character in a CDATA section.
633   */
634  private void enterStateCdataMayClose() {
635    cdataCloseTag.startRecording();
636  }
637
638  /**
639   * Determines whether to close the tag element, It closes it if it finds
640   * the corresponding end tag. Called when reading what could be a
641   * closing CDATA tag.
642   *
643   * @param input the character read
644   * @param expectedNextState the expected state to go to next
645   *        unless we want to change it here
646   * @return the next state to go to
647   */
648  private InternalState exitStateCdataMayClose(
649      InternalState expectedNextState,
650      char input) {
651    InternalState nextState = expectedNextState;
652    cdataCloseTag.stopRecording();
653    String cdataCloseTagString = cdataCloseTag.getContent();
654    Preconditions.checkState(!cdataCloseTagString.isEmpty()
655        && cdataCloseTagString.charAt(0) == '/');  // Developer error.
656
657    if (cdataCloseTagString.substring(1).equalsIgnoreCase(getTag())
658        && (input == '>' || HtmlUtils.isHtmlSpace(input))) {
659      tag.clear();
660      insideJavascript = false;
661    } else {
662      nextState = CDATA_TEXT;
663    }
664    return nextState;
665  }
666
667
668  // ======================================================= //
669  // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE.     //
670  // ======================================================= //
671
672  private static void registerMapping(InternalState internalState,
673                                      ExternalState externalState) {
674    STATE_MAPPING.put(internalState, externalState);
675  }
676
677  private static void initializeStateMapping() {
678    // Each parser implementation must map the error state appropriately.
679    registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR);
680
681    registerMapping(TEXT, HtmlParser.STATE_TEXT);
682    registerMapping(TAG_START, HtmlParser.STATE_TAG);
683    registerMapping(TAG_NAME, HtmlParser.STATE_TAG);
684    registerMapping(DECL_START, HtmlParser.STATE_TEXT);
685    registerMapping(DECL_BODY, HtmlParser.STATE_TEXT);
686    registerMapping(COM_OPEN, HtmlParser.STATE_TEXT);
687    registerMapping(COM_BODY, HtmlParser.STATE_COMMENT);
688    registerMapping(COM_DASH, HtmlParser.STATE_COMMENT);
689    registerMapping(COM_DASH_DASH, HtmlParser.STATE_COMMENT);
690    registerMapping(PI, HtmlParser.STATE_TEXT);
691    registerMapping(PI_MAY_END, HtmlParser.STATE_TEXT);
692    registerMapping(TAG_SPACE, HtmlParser.STATE_TAG);
693    registerMapping(TAG_CLOSE, HtmlParser.STATE_TEXT);
694    registerMapping(ATTR, HtmlParser.STATE_ATTR);
695    registerMapping(ATTR_SPACE, HtmlParser.STATE_ATTR);
696    registerMapping(VALUE, HtmlParser.STATE_VALUE);
697    registerMapping(VALUE_TEXT, HtmlParser.STATE_VALUE);
698    registerMapping(VALUE_Q_START, HtmlParser.STATE_VALUE);
699    registerMapping(VALUE_Q, HtmlParser.STATE_VALUE);
700    registerMapping(VALUE_DQ_START, HtmlParser.STATE_VALUE);
701    registerMapping(VALUE_DQ, HtmlParser.STATE_VALUE);
702    registerMapping(CDATA_COM_START, HtmlParser.STATE_TEXT);
703    registerMapping(CDATA_COM_START_DASH, HtmlParser.STATE_TEXT);
704    registerMapping(CDATA_COM_BODY, HtmlParser.STATE_TEXT);
705    registerMapping(CDATA_COM_DASH, HtmlParser.STATE_TEXT);
706    registerMapping(CDATA_COM_DASH_DASH, HtmlParser.STATE_TEXT);
707    registerMapping(CDATA_TEXT, HtmlParser.STATE_TEXT);
708    registerMapping(CDATA_LT, HtmlParser.STATE_TEXT);
709    registerMapping(CDATA_MAY_CLOSE, HtmlParser.STATE_TEXT);
710    registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE);
711    registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE);
712  }
713
714  private static void registerTransition(String expression,
715                                         InternalState source,
716                                         InternalState to) {
717    // It seems to silly to go through a StateTableTransition here
718    // but it adds extra data checking.
719    StateTableTransition stt = new StateTableTransition(expression,
720                                                        source, to);
721    STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
722                              stt.getTo());
723  }
724
725  // NOTE: The "[:default:]" transition should be registered before any
726  //   other transitions for a given state or it will over-write them.
727  private static void initializeParserStateTable() {
728    registerTransition("[:default:]", CSS_FILE, CSS_FILE);
729    registerTransition("[:default:]", JS_FILE, JS_FILE);
730    registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT);
731    registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE);
732    registerTransition(">", CDATA_MAY_CLOSE, TEXT);
733    registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE);
734    registerTransition("[:default:]", CDATA_LT, CDATA_TEXT);
735    registerTransition("!", CDATA_LT, CDATA_COM_START);
736    registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE);
737    registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT);
738    registerTransition("<", CDATA_TEXT, CDATA_LT);
739    registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY);
740    registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT);
741    registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH);
742    registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY);
743    registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH);
744    registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY);
745    registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH);
746    registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT);
747    registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY);
748    registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT);
749    registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH);
750    registerTransition("[:default:]", VALUE_DQ, VALUE_DQ);
751    registerTransition("\"", VALUE_DQ, TAG_SPACE);
752    registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ);
753    registerTransition("\"", VALUE_DQ_START, TAG_SPACE);
754    registerTransition("[:default:]", VALUE_Q, VALUE_Q);
755    registerTransition("\'", VALUE_Q, TAG_SPACE);
756    registerTransition("[:default:]", VALUE_Q_START, VALUE_Q);
757    registerTransition("\'", VALUE_Q_START, TAG_SPACE);
758    registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT);
759    registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE);
760    registerTransition(">", VALUE_TEXT, TAG_CLOSE);
761    registerTransition("[:default:]", VALUE, VALUE_TEXT);
762    registerTransition(">", VALUE, TAG_CLOSE);
763    registerTransition(" \t\n\r", VALUE, VALUE);
764    registerTransition("\"", VALUE, VALUE_DQ_START);
765    registerTransition("\'", VALUE, VALUE_Q_START);
766    registerTransition("=", ATTR_SPACE, VALUE);
767    registerTransition("/", ATTR_SPACE, TAG_SPACE);
768    registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR);
769    registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE);
770    registerTransition(">", ATTR_SPACE, TAG_CLOSE);
771    registerTransition(" \t\n\r", ATTR, ATTR_SPACE);
772    registerTransition("=", ATTR, VALUE);
773    registerTransition("/", ATTR, TAG_SPACE);
774    registerTransition(">", ATTR, TAG_CLOSE);
775    registerTransition("A-Za-z0-9_:.-", ATTR, ATTR);
776    registerTransition("[:default:]", TAG_CLOSE, TEXT);
777    registerTransition("<", TAG_CLOSE, TAG_START);
778    registerTransition("/", TAG_SPACE, TAG_SPACE);
779    registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR);
780    registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE);
781    registerTransition(">", TAG_SPACE, TAG_CLOSE);
782    registerTransition("[:default:]", PI_MAY_END, PI);
783    registerTransition(">", PI_MAY_END, TEXT);
784    registerTransition("[:default:]", PI, PI);
785    registerTransition("?", PI, PI_MAY_END);
786    registerTransition("[:default:]", COM_DASH_DASH, COM_BODY);
787    registerTransition(">", COM_DASH_DASH, TEXT);
788    registerTransition("-", COM_DASH_DASH, COM_DASH_DASH);
789    registerTransition("[:default:]", COM_DASH, COM_BODY);
790    registerTransition("-", COM_DASH, COM_DASH_DASH);
791    registerTransition("[:default:]", COM_BODY, COM_BODY);
792    registerTransition("-", COM_BODY, COM_DASH);
793    registerTransition("[:default:]", COM_OPEN, TEXT);
794    registerTransition("-", COM_OPEN, COM_BODY);
795    registerTransition("[:default:]", DECL_BODY, DECL_BODY);
796    registerTransition(">", DECL_BODY, TEXT);
797    registerTransition("[:default:]", DECL_START, DECL_BODY);
798    registerTransition(">", DECL_START, TEXT);
799    registerTransition("-", DECL_START, COM_OPEN);
800    registerTransition(">", TAG_NAME, TAG_CLOSE);
801    registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE);
802    registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME);
803
804    // Manual change to remain in-sync with CL 10597850 in C HtmlParser.
805    registerTransition("[:default:]", TAG_START, TEXT);
806    registerTransition("<", TAG_START, TAG_START);
807    // End of manual change.
808
809    registerTransition("!", TAG_START, DECL_START);
810    registerTransition("?", TAG_START, PI);
811    registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME);
812    registerTransition("[:default:]", TEXT, TEXT);
813    registerTransition("<", TEXT, TAG_START);
814  }
815}
816