156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/*
256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Copyright (C) 2010 Google Inc.
356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Licensed under the Apache License, Version 2.0 (the "License");
556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * you may not use this file except in compliance with the License.
656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * You may obtain a copy of the License at
756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * http://www.apache.org/licenses/LICENSE-2.0
956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
1056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Unless required by applicable law or agreed to in writing, software
1156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * distributed under the License is distributed on an "AS IS" BASIS,
1256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * See the License for the specific language governing permissions and
1456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * limitations under the License.
1556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */
1656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
1756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpackage com.google.streamhtmlparser.impl;
1856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
1956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.common.base.Preconditions;
2056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.common.collect.Maps;
2156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.ExternalState;
2256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.HtmlParser;
2356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.ParseException;
2456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.util.CharacterRecorder;
2556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.util.EntityResolver;
2656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.util.HtmlUtils;
2756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
2856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport java.util.Map;
2956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
3056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/**
3156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * A custom specialized parser - ported from the main C++ version - used to
3256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * implement context-aware escaping of run-time data in web-application
3356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * templates.
3456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
3556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>This is the main class in the package. It implements the
3656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@code HtmlParser} interface.
3756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
 * <p>This class is not thread-safe; in particular, you cannot invoke any
 * state-changing operation (such as {@code parse}) from multiple threads
 * on the same object.
4156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
4256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>If you are looking at this class, chances are very high you are
4356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * implementing Auto-Escaping for a new template system. Please see the
4456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * landing page including a design document at
4556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <a href="http://go/autoescape">Auto-Escape Landing Page</a>.
4656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */
4756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpublic class HtmlParserImpl extends GenericParser implements HtmlParser {
4856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
4956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /*
5056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Internal representation of the parser state, which is at a
5156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * finer-granularity than the external state as given to callers.
5256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * The relationship between <code>InternalState</code> and
5356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <code>ExternalState</code> is a many-to-one relationship.
5456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
5556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState TEXT;
5656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState TAG_START;
5756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState TAG_NAME;
5856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState DECL_START;
5956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState DECL_BODY;
6056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState COM_OPEN;
6156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState COM_BODY;
6256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState COM_DASH;
6356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState COM_DASH_DASH;
6456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState PI;
6556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState PI_MAY_END;
6656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState TAG_SPACE;
6756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState TAG_CLOSE;
6856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState ATTR;
6956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState ATTR_SPACE;
7056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState VALUE;
7156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState VALUE_TEXT;
7256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState VALUE_Q_START;
7356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState VALUE_Q;
7456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState VALUE_DQ_START;
7556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState VALUE_DQ;
7656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState CDATA_COM_START;
7756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState CDATA_COM_START_DASH;
7856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState CDATA_COM_BODY;
7956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState CDATA_COM_DASH;
8056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState CDATA_COM_DASH_DASH;
8156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState CDATA_TEXT;
8256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState CDATA_LT;
8356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState CDATA_MAY_CLOSE;
8456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState JS_FILE;
8556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final InternalState CSS_FILE;
8656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
8756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  static {
8856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    TEXT = InternalState.getInstanceHtml("TEXT");
8956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    TAG_START = InternalState.getInstanceHtml("TAG_START");
9056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    TAG_NAME = InternalState.getInstanceHtml("TAG_NAME");
9156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    DECL_START = InternalState.getInstanceHtml("DECL_START");
9256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    DECL_BODY = InternalState.getInstanceHtml("DECL_BODY");
9356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    COM_OPEN = InternalState.getInstanceHtml("COM_OPEN");
9456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    COM_BODY = InternalState.getInstanceHtml("COM_BODY");
9556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    COM_DASH = InternalState.getInstanceHtml("COM_DASH");
9656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    COM_DASH_DASH = InternalState.getInstanceHtml("COM_DASH_DASH");
9756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    PI =InternalState.getInstanceHtml("PI");
9856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    PI_MAY_END = InternalState.getInstanceHtml("PI_MAY_END");
9956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    TAG_SPACE = InternalState.getInstanceHtml("TAG_SPACE");
10056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    TAG_CLOSE = InternalState.getInstanceHtml("TAG_CLOSE");
10156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    ATTR = InternalState.getInstanceHtml("ATTR");
10256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    ATTR_SPACE = InternalState.getInstanceHtml("ATTR_SPACE");
10356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    VALUE = InternalState.getInstanceHtml("VALUE");
10456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    VALUE_TEXT = InternalState.getInstanceHtml("VALUE_TEXT");
10556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    VALUE_Q_START = InternalState.getInstanceHtml("VALUE_Q_START");
10656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    VALUE_Q = InternalState.getInstanceHtml("VALUE_Q");
10756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    VALUE_DQ_START = InternalState.getInstanceHtml("VALUE_DQ_START");
10856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    VALUE_DQ = InternalState.getInstanceHtml("VALUE_DQ");
10956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CDATA_COM_START = InternalState.getInstanceHtml("CDATA_COM_START");
11056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CDATA_COM_START_DASH =
11156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        InternalState.getInstanceHtml("CDATA_COM_START_DASH");
11256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CDATA_COM_BODY = InternalState.getInstanceHtml("CDATA_COM_BODY");
11356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CDATA_COM_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH");
11456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CDATA_COM_DASH_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH_DASH");
11556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CDATA_TEXT = InternalState.getInstanceHtml("CDATA_TEXT");
11656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CDATA_LT = InternalState.getInstanceHtml("CDATA_LT");
11756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CDATA_MAY_CLOSE = InternalState.getInstanceHtml("CDATA_MAY_CLOSE");
11856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    JS_FILE = InternalState.getInstanceHtml("JS_FILE");
11956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CSS_FILE = InternalState.getInstanceHtml("CSS_FILE");
12056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
12156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
12256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final Map<InternalState, ExternalState> STATE_MAPPING =
12356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      Maps.newHashMap();
12456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  static {
12556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    initializeStateMapping();
12656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
12756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
12856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final ParserStateTable STATE_TABLE = new ParserStateTable();
12956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  static {
13056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    initializeParserStateTable();
13156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
13256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
13356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private final CharacterRecorder tag;
13456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private final CharacterRecorder attr;
13556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private final CharacterRecorder value;
13656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private final CharacterRecorder cdataCloseTag;
13756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private final EntityResolver entityResolver;
13856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private final JavascriptParserImpl jsParser;
13956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private boolean insideJavascript;
14056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private int valueIndex;
14156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  // True iff InsertText() was called at the start of a URL attribute value.
14256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private boolean textInsideUrlValue;
14356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
/**
 * Creates an {@code HtmlParserImpl} that starts out in the {@code TEXT}
 * state with empty recorders and a fresh Javascript parser.
 *
 * <p>Both for performance reasons and to reuse a state-flow machine that is
 * automatically generated from Python for multiple target languages, this
 * object uses a static, read-only {@code ParserStateTable} obtained from
 * the generated code in {@code HtmlParserFsm}. That code also maintains
 * the mapping from internal states ({@code InternalState}) to external
 * states ({@code ExternalState}).
 */
public HtmlParserImpl() {
  super(STATE_TABLE, STATE_MAPPING, TEXT);

  // Recorders and helper parsers all start out fresh.
  this.tag = new CharacterRecorder();
  this.attr = new CharacterRecorder();
  this.value = new CharacterRecorder();
  this.cdataCloseTag = new CharacterRecorder();
  this.entityResolver = new EntityResolver();
  this.jsParser = new JavascriptParserImpl();

  // Fine-grained flags and counters begin at their neutral values.
  this.insideJavascript = false;
  this.valueIndex = 0;
  this.textInsideUrlValue = false;
}
16656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
/**
 * Copy constructor: creates an {@code HtmlParserImpl} whose state mirrors
 * the parser provided.
 *
 * <p>Each recorder, the entity resolver and the Javascript parser are
 * duplicated through their own copy constructors rather than shared; the
 * primitive flags and the value index are copied directly.
 *
 * @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy
 */
public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) {
  super(aHtmlParserImpl);

  // Helper objects: one new instance per field, seeded from the source.
  this.tag = new CharacterRecorder(aHtmlParserImpl.tag);
  this.attr = new CharacterRecorder(aHtmlParserImpl.attr);
  this.value = new CharacterRecorder(aHtmlParserImpl.value);
  this.cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag);
  this.entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver);
  this.jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser);

  // Plain values.
  this.insideJavascript = aHtmlParserImpl.insideJavascript;
  this.valueIndex = aHtmlParserImpl.valueIndex;
  this.textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue;
}
18456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
18556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
18656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean inJavascript() {
18756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return (insideJavascript
18856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            && ( (getState() == STATE_VALUE)
18956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || (currentState == CDATA_TEXT)
19056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || (currentState == CDATA_COM_START)
19156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || (currentState == CDATA_COM_START_DASH)
19256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || (currentState == CDATA_COM_BODY)
19356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || (currentState == CDATA_COM_DASH)
19456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || (currentState == CDATA_COM_DASH_DASH)
19556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || (currentState == CDATA_LT)
19656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || (currentState == CDATA_MAY_CLOSE)
19756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || (currentState == JS_FILE) ));
19856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
19956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
20056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
20156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean isJavascriptQuoted() {
20256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (inJavascript()) {
20356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      ExternalState jsParserState = jsParser.getState();
20456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return (jsParserState == JavascriptParserImpl.STATE_Q
20556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson              || jsParserState == JavascriptParserImpl.STATE_DQ);
20656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
20756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return false;
20856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
20956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
21056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
21156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean inAttribute() {
21256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    ExternalState extState = getState();
21356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return (extState != null && (extState == STATE_ATTR
21456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                 || extState == STATE_VALUE));
21556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
21656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
21756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
21856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns {@code true} if and only if the parser is currently within
21956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * a CSS context. A CSS context is one of the below:
22056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
22156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>Inside a STYLE tag.
22256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>Inside a STYLE attribute.
22356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>Inside a CSS file when the parser was reset in the CSS mode.
22456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
22556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
22656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if and only if the parser is inside CSS
22756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
22856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
22956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean inCss() {
23056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return (currentState == CSS_FILE
23156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            || (getState() == STATE_VALUE
23256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                && (getAttributeType() == ATTR_TYPE.STYLE))
23356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            || ("style".equals(getTag())));
23456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
23556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
23656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
23756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public ATTR_TYPE getAttributeType() {
23856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    String attribute = getAttribute();
23956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (!inAttribute()) {
24056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return ATTR_TYPE.NONE;
24156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
24256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (HtmlUtils.isAttributeJavascript(attribute)) {
24356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return ATTR_TYPE.JS;
24456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
24556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (HtmlUtils.isAttributeUri(attribute)) {
24656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return ATTR_TYPE.URI;
24756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
24856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (HtmlUtils.isAttributeStyle(attribute)) {
24956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return ATTR_TYPE.STYLE;
25056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
25156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
25256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Special logic to handle the "content" attribute of the "meta" tag.
25356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if ("meta".equals(getTag()) && "content".equals(getAttribute())) {
25456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      HtmlUtils.META_REDIRECT_TYPE redirectType =
25556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          HtmlUtils.parseContentAttributeForUrl(getValue());
25656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      if (redirectType == HtmlUtils.META_REDIRECT_TYPE.URL_START ||
25756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          redirectType == HtmlUtils.META_REDIRECT_TYPE.URL)
25856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        return ATTR_TYPE.URI;
25956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
26056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
26156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return ATTR_TYPE.REGULAR;
26256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
26356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
26456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
26556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public ExternalState getJavascriptState() {
26656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return jsParser.getState();
26756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
26856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
26956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
27056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean isAttributeQuoted() {
27156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return (currentState == VALUE_Q_START
27256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            || currentState == VALUE_Q
27356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            || currentState == VALUE_DQ_START
27456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            || currentState == VALUE_DQ);
27556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
27656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
27756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
27856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public String getTag() {
27956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return tag.getContent().toLowerCase();
28056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
28156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
28256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
28356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public String getAttribute() {
28456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return inAttribute() ? attr.getContent().toLowerCase() : "";
28556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
28656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
28756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
28856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public String getValue() {
28956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return (getState() == STATE_VALUE) ? value.getContent() : "";
29056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
29156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
29256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
29356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public int getValueIndex() {
29456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (getState() != STATE_VALUE) {
29556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return 0;
29656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
29756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return valueIndex;
29856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
29956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
30056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
30156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean isUrlStart() {
30256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // False when not inside an HTML attribute value
30356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (getState() != STATE_VALUE) {
30456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return false;
30556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
30656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
30756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    //  Or when the HTML attribute is not of URI type.
30856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (getAttributeType() != ATTR_TYPE.URI) {
30956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return false;
31056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
31156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
31256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Or when we received an InsertText() directive at the start of a URL.
31356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (textInsideUrlValue) {
31456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return false;
31556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
31656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
31756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if ("meta".equals(getTag())) {
31856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      // At this point, we know we are in the "content" attribute
31956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      // or we would not have the URI attribute type.
32056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return (HtmlUtils.parseContentAttributeForUrl(getValue()) ==
32156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson              HtmlUtils.META_REDIRECT_TYPE.URL_START);
32256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
32356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
32456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // For all other URI attributes, check if we are at index 0.
32556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return (getValueIndex() == 0);
32656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson}
32756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
32856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
32956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@inheritDoc}
33056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
33156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Resets the state of the parser to a state consistent with the
33256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code Mode} provided. This will reset finer-grained state
33356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * information back to a default value, hence use only when
33456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * you want to parse text from a very clean slate.
33556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
33656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>See the {@link HtmlParser.Mode} enum for information on all
33756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * the valid modes.
33856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
33956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param mode is an enum representing the high-level state of the parser
34056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
34156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
34256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public void resetMode(Mode mode) {
34356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    insideJavascript = false;
34456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    tag.reset();
34556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    attr.reset();
34656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    value.reset();
34756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    cdataCloseTag.reset();
34856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    valueIndex = 0;
34956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    textInsideUrlValue = false;
35056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    jsParser.reset();
35156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
35256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    switch (mode) {
35356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      case HTML:
35456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        currentState = TEXT;
35556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        break;
35656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      case JS:
35756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        currentState = JS_FILE;
35856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        insideJavascript = true;
35956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        break;
36056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      case CSS:
36156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        currentState = CSS_FILE;
36256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        break;
36356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      case HTML_IN_TAG:
36456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        currentState = TAG_SPACE;
36556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        break;
36656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      default:
36756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        throw new IllegalArgumentException("Did not recognize Mode: " +
36856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                           mode.toString());
36956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
37056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
37156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
37256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
37356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Resets the state of the parser to the initial state of parsing HTML.
37456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
37556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public void reset() {
37656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    super.reset();
37756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    resetMode(Mode.HTML);
37856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
37956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
38056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
38156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * A specialized directive to tell the parser there is some content
38256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * that will be inserted here but that it will not get to parse. Used
38356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * by the template system that may not be able to give some content
38456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * to the parser but wants it to know there typically will be content
38556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * inserted at that point.  This is a hint used in corner cases within
38656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * parsing of HTML attribute names and values where content we do not
38756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * get to see could affect our parsing and alter our current state.
38856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
38956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>The two cases where {@code #insertText()} affects our parsing are:
39056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
39156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>We are at the start of the value of a URL-accepting HTML attribute. In
39256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * that case, we change internal state to no longer be considered at the
39356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * start of the URL. This may affect what escaping template systems may want
39456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * to perform on the HTML attribute value. We avoid injecting fake data and
39556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * hence not modify the current index of the value as determined by
39656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@link #getValueIndex()}</li>
39756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>We just transitioned from an attribute name to an attribute value
39856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * (by parsing the separating {@code '='} character). In that case, we
39956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * change internal state to be now inside a non-quoted HTML attribute
40056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * value.</li>
40156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
40256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
40356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @throws ParseException if an unrecoverable error occurred during parsing
40456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
40556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
40656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public void insertText() throws ParseException {
40756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Case: Inside URL attribute value.
40856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (getState() == STATE_VALUE
40956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        && getAttributeType() == ATTR_TYPE.URI
41056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        && isUrlStart()) {
41156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      textInsideUrlValue = true;
41256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
41356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Case: Before parsing any attribute value.
41456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (currentState == VALUE) {
41556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      setNextState(VALUE_TEXT);
41656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
41756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
41856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
41956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
42056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  protected InternalState handleEnterState(InternalState currentState,
42156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                           InternalState expectedNextState,
42256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                           char input) {
42356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    InternalState nextState = expectedNextState;
42456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (currentState == TAG_NAME) {
42556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      enterTagName();
42656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (currentState == ATTR) {
42756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      enterAttribute();
42856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (currentState == TAG_CLOSE) {
42956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      nextState = tagClose(currentState);
43056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (currentState == CDATA_MAY_CLOSE) {
43156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      enterStateCdataMayClose();
43256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (currentState == VALUE) {
43356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      enterValue();
43456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else
43556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (currentState == VALUE_TEXT || currentState == VALUE_Q
43656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || currentState == VALUE_DQ) {
43756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      enterValueContent();
43856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
43956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return nextState;
44056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
44156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
44256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
44356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  protected InternalState handleExitState(InternalState currentState,
44456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                          InternalState expectedNextState,
44556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                          char input) {
44656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    InternalState nextState = expectedNextState;
44756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (currentState == TAG_NAME) {
44856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      exitTagName();
44956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (currentState == ATTR) {
45056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      exitAttribute();
45156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (currentState == CDATA_MAY_CLOSE) {
45256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      nextState = exitStateCdataMayClose(nextState, input);
45356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else
45456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if ((currentState == VALUE_TEXT) || (currentState == VALUE_Q)
45556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (currentState == VALUE_DQ)) {
45656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      exitValueContent();
45756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
45856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return nextState;
45956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
46056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
46156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
46256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  protected InternalState handleInState(InternalState currentState,
46356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                        char input) throws ParseException {
46456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if ((currentState == CDATA_TEXT)
46556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (currentState == CDATA_COM_START)
46656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (currentState == CDATA_COM_START_DASH)
46756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (currentState == CDATA_COM_BODY)
46856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (currentState == CDATA_COM_DASH)
46956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (currentState == CDATA_COM_DASH_DASH)
47056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (currentState == CDATA_LT)
47156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (currentState == CDATA_MAY_CLOSE)
47256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (currentState == JS_FILE)) {
47356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      inStateCdata(input);
47456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if ((currentState == VALUE_TEXT)
47556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson               || (currentState == VALUE_Q)
47656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson               || (currentState == VALUE_DQ)) {
47756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      inStateValue(input);
47856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
47956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return currentState;
48056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
48156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
48256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
48356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Invokes recording on all CharacterRecorder objects. Currently we do
48456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * not check that one and only one of them is recording. I did a fair
48556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * bit of testing on the C++ parser and was not convinced there is
48656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * such a guarantee.
48756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
48856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  @Override
48956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  protected void record(char input) {
49056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    attr.maybeRecord(input);
49156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    tag.maybeRecord(input);
49256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    value.maybeRecord(input);
49356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    cdataCloseTag.maybeRecord(input);
49456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
49556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
49656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
49756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Starts recording the name of the HTML tag. Called when the parser
49856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * enters a new tag.
49956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
50056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void enterTagName() {
50156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    tag.startRecording();
50256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
50356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
50456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void exitTagName() {
50556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    tag.stopRecording();
50656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    String tagString = tag.getContent();
50756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (!tagString.isEmpty() && tagString.charAt(0) == '/') {
50856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      tag.reset();
50956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
51056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
51156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
51256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
51356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Starts recording the name of the HTML attribute. Called when the parser
51456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * enters a new HTML attribute.
51556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
51656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void enterAttribute() {
51756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    attr.startRecording();
51856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
51956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
52056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void exitAttribute() {
52156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    attr.stopRecording();
52256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
52356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
52456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
52556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Tracks the index within the HTML attribute value and initializes
52656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * the javascript parser for attributes that take javascript.
52756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
52856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Called when the parser enters a new HTML attribute value.
52956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
53056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void enterValue() {
53156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    valueIndex = 0;
53256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    textInsideUrlValue = false;
53356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (HtmlUtils.isAttributeJavascript(getAttribute())) {
53456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      entityResolver.reset();
53556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      jsParser.reset();
53656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      insideJavascript = true;
53756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else {
53856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      insideJavascript = false;
53956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
54056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
54156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
54256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
54356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Starts recordning the contents of the attribute value.
54456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
54556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Called when entering an attribute value.
54656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
54756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void enterValueContent() {
54856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    value.startRecording();
54956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
55056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
55156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
55256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Stops the recording of the attribute value and exits javascript
55356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * (in case we were inside it).
55456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
55556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void exitValueContent() {
55656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    value.stopRecording();
55756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    insideJavascript = false;
55856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
55956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
56056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
56156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Processes javascript after performing entity resolution and updates
56256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * the position within the attribute value.
56356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * If the status of the entity resolution is <code>IN_PROGRESS</code>,
56456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * we don't invoke the javascript parser.
56556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
56656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>Called for every character inside an attribute value.
56756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
56856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param input character read
56956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @throws ParseException if an unrecoverable error occurred during parsing
57056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
57156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void inStateValue(char input) throws ParseException {
57256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    valueIndex++;
57356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (insideJavascript) {
57456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      EntityResolver.Status status = entityResolver.processChar(input);
57556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      if (status == EntityResolver.Status.COMPLETED) {
57656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        jsParser.parse(entityResolver.getEntity());
57756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        entityResolver.reset();
57856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      } else if (status == EntityResolver.Status.NOT_STARTED) {
57956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        jsParser.parse(input);
58056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      }
58156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
58256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
58356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
58456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
58556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Handles the tag it finished reading.
58656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
58756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>For a script tag, it initializes the javascript parser. For all
58856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * tags that are recognized to have CDATA values
58956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * (including the script tag), it switches the CDATA state to handle them
59056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * properly. For code simplification, CDATA and RCDATA sections are
59156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * treated the same.
59256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
59356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>Called when the parser leaves a tag definition.
59456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
59556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param state current state
59656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return state next state, could be the same as current state
59756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
59856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private InternalState tagClose(InternalState state) {
59956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    InternalState nextState = state;
60056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    String tagName = getTag();
60156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if ("script".equals(tagName)) {
60256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      nextState = CDATA_TEXT;
60356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      jsParser.reset();
60456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      insideJavascript = true;
60556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if ("style".equals(tagName)
60656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || "title".equals(tagName)
60756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                 || "textarea".equals(tagName)) {
60856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      nextState = CDATA_TEXT;
60956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      insideJavascript = false;
61056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
61156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return nextState;
61256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
61356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
61456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
61556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Feeds the character to the javascript parser for processing.
61656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
61756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>Called inside CDATA blocks to parse javascript.
61856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
61956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param input character read
62056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @throws ParseException if an unrecoverable error occurred during parsing
62156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
62256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void inStateCdata(char input) throws ParseException {
62356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (insideJavascript) {
62456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      jsParser.parse(input);
62556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
62656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
62756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
62856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
62956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Starts recording. This is so we find the closing tag name in order to
63056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * know if the tag is going to be closed or not.
63156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
63256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>Called when encountering a '<' character in a CDATA section.
63356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
63456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private void enterStateCdataMayClose() {
63556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    cdataCloseTag.startRecording();
63656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
63756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
63856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
63956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Determines whether to close the tag element, It closes it if it finds
64056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * the corresponding end tag. Called when reading what could be a
64156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * closing CDATA tag.
64256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
64356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param input the character read
64456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param expectedNextState the expected state to go to next
64556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *        unless we want to change it here
64656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return the next state to go to
64756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
64856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private InternalState exitStateCdataMayClose(
64956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      InternalState expectedNextState,
65056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      char input) {
65156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    InternalState nextState = expectedNextState;
65256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    cdataCloseTag.stopRecording();
65356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    String cdataCloseTagString = cdataCloseTag.getContent();
65456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    Preconditions.checkState(!cdataCloseTagString.isEmpty()
65556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        && cdataCloseTagString.charAt(0) == '/');  // Developer error.
65656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
65756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (cdataCloseTagString.substring(1).equalsIgnoreCase(getTag())
65856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        && (input == '>' || HtmlUtils.isHtmlSpace(input))) {
65956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      tag.clear();
66056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      insideJavascript = false;
66156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else {
66256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      nextState = CDATA_TEXT;
66356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
66456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return nextState;
66556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
66656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
66756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
66856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  // ======================================================= //
66956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE.     //
67056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  // ======================================================= //
67156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
67256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static void registerMapping(InternalState internalState,
67356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                      ExternalState externalState) {
67456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    STATE_MAPPING.put(internalState, externalState);
67556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
67656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
67756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static void initializeStateMapping() {
67856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Each parser implementation must map the error state appropriately.
67956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR);
68056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
68156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(TEXT, HtmlParser.STATE_TEXT);
68256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(TAG_START, HtmlParser.STATE_TAG);
68356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(TAG_NAME, HtmlParser.STATE_TAG);
68456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(DECL_START, HtmlParser.STATE_TEXT);
68556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(DECL_BODY, HtmlParser.STATE_TEXT);
68656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(COM_OPEN, HtmlParser.STATE_TEXT);
68756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(COM_BODY, HtmlParser.STATE_COMMENT);
68856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(COM_DASH, HtmlParser.STATE_COMMENT);
68956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(COM_DASH_DASH, HtmlParser.STATE_COMMENT);
69056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(PI, HtmlParser.STATE_TEXT);
69156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(PI_MAY_END, HtmlParser.STATE_TEXT);
69256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(TAG_SPACE, HtmlParser.STATE_TAG);
69356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(TAG_CLOSE, HtmlParser.STATE_TEXT);
69456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(ATTR, HtmlParser.STATE_ATTR);
69556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(ATTR_SPACE, HtmlParser.STATE_ATTR);
69656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(VALUE, HtmlParser.STATE_VALUE);
69756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(VALUE_TEXT, HtmlParser.STATE_VALUE);
69856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(VALUE_Q_START, HtmlParser.STATE_VALUE);
69956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(VALUE_Q, HtmlParser.STATE_VALUE);
70056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(VALUE_DQ_START, HtmlParser.STATE_VALUE);
70156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(VALUE_DQ, HtmlParser.STATE_VALUE);
70256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(CDATA_COM_START, HtmlParser.STATE_TEXT);
70356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(CDATA_COM_START_DASH, HtmlParser.STATE_TEXT);
70456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(CDATA_COM_BODY, HtmlParser.STATE_TEXT);
70556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(CDATA_COM_DASH, HtmlParser.STATE_TEXT);
70656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(CDATA_COM_DASH_DASH, HtmlParser.STATE_TEXT);
70756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(CDATA_TEXT, HtmlParser.STATE_TEXT);
70856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(CDATA_LT, HtmlParser.STATE_TEXT);
70956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(CDATA_MAY_CLOSE, HtmlParser.STATE_TEXT);
71056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE);
71156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE);
71256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
71356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
71456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static void registerTransition(String expression,
71556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                         InternalState source,
71656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                         InternalState to) {
71756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // It seems to silly to go through a StateTableTransition here
71856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // but it adds extra data checking.
71956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    StateTableTransition stt = new StateTableTransition(expression,
72056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                                        source, to);
72156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
72256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                              stt.getTo());
72356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
72456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
72556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  // NOTE: The "[:default:]" transition should be registered before any
72656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  //   other transitions for a given state or it will over-write them.
72756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static void initializeParserStateTable() {
72856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", CSS_FILE, CSS_FILE);
72956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", JS_FILE, JS_FILE);
73056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT);
73156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE);
73256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", CDATA_MAY_CLOSE, TEXT);
73356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE);
73456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", CDATA_LT, CDATA_TEXT);
73556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("!", CDATA_LT, CDATA_COM_START);
73656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE);
73756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT);
73856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("<", CDATA_TEXT, CDATA_LT);
73956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY);
74056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT);
74156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH);
74256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY);
74356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH);
74456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY);
74556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH);
74656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT);
74756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY);
74856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT);
74956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH);
75056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", VALUE_DQ, VALUE_DQ);
75156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("\"", VALUE_DQ, TAG_SPACE);
75256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ);
75356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("\"", VALUE_DQ_START, TAG_SPACE);
75456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", VALUE_Q, VALUE_Q);
75556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("\'", VALUE_Q, TAG_SPACE);
75656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", VALUE_Q_START, VALUE_Q);
75756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("\'", VALUE_Q_START, TAG_SPACE);
75856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT);
75956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE);
76056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", VALUE_TEXT, TAG_CLOSE);
76156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", VALUE, VALUE_TEXT);
76256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", VALUE, TAG_CLOSE);
76356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(" \t\n\r", VALUE, VALUE);
76456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("\"", VALUE, VALUE_DQ_START);
76556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("\'", VALUE, VALUE_Q_START);
76656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("=", ATTR_SPACE, VALUE);
76756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("/", ATTR_SPACE, TAG_SPACE);
76856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR);
76956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE);
77056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", ATTR_SPACE, TAG_CLOSE);
77156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(" \t\n\r", ATTR, ATTR_SPACE);
77256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("=", ATTR, VALUE);
77356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("/", ATTR, TAG_SPACE);
77456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", ATTR, TAG_CLOSE);
77556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("A-Za-z0-9_:.-", ATTR, ATTR);
77656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", TAG_CLOSE, TEXT);
77756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("<", TAG_CLOSE, TAG_START);
77856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("/", TAG_SPACE, TAG_SPACE);
77956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR);
78056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE);
78156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", TAG_SPACE, TAG_CLOSE);
78256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", PI_MAY_END, PI);
78356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", PI_MAY_END, TEXT);
78456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", PI, PI);
78556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("?", PI, PI_MAY_END);
78656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", COM_DASH_DASH, COM_BODY);
78756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", COM_DASH_DASH, TEXT);
78856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", COM_DASH_DASH, COM_DASH_DASH);
78956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", COM_DASH, COM_BODY);
79056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", COM_DASH, COM_DASH_DASH);
79156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", COM_BODY, COM_BODY);
79256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", COM_BODY, COM_DASH);
79356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", COM_OPEN, TEXT);
79456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", COM_OPEN, COM_BODY);
79556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", DECL_BODY, DECL_BODY);
79656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", DECL_BODY, TEXT);
79756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", DECL_START, DECL_BODY);
79856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", DECL_START, TEXT);
79956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("-", DECL_START, COM_OPEN);
80056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(">", TAG_NAME, TAG_CLOSE);
80156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE);
80256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME);
80356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
80456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Manual change to remain in-sync with CL 10597850 in C HtmlParser.
80556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", TAG_START, TEXT);
80656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("<", TAG_START, TAG_START);
80756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // End of manual change.
80856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
80956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("!", TAG_START, DECL_START);
81056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("?", TAG_START, PI);
81156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME);
81256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("[:default:]", TEXT, TEXT);
81356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    registerTransition("<", TEXT, TAG_START);
81456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
81556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson}
816