156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/* 256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Copyright (C) 2010 Google Inc. 356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Licensed under the Apache License, Version 2.0 (the "License"); 556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * you may not use this file except in compliance with the License. 656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * You may obtain a copy of the License at 756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * http://www.apache.org/licenses/LICENSE-2.0 956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 1056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Unless required by applicable law or agreed to in writing, software 1156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * distributed under the License is distributed on an "AS IS" BASIS, 1256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * See the License for the specific language governing permissions and 1456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * limitations under the License. 1556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 1656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 1756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpackage com.google.streamhtmlparser.impl; 1856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 1956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.common.base.Preconditions; 2056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.common.collect.Maps; 2156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.ExternalState; 2256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.HtmlParser; 2356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.ParseException; 2456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.util.CharacterRecorder; 2556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.util.EntityResolver; 2656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.util.HtmlUtils; 2756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 2856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport java.util.Map; 2956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 3056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/** 3156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * A custom specialized parser - ported from the main C++ version - used to 3256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * implement context-aware escaping of run-time data in web-application 3356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * templates. 3456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 3556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>This is the main class in the package. It implements the 3656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@code HtmlParser} interface. 3756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 3856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>This class is not thread-safe, in particular you cannot invoke any 3956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * state changing operations (such as {@code parse} from multiple threads 4056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * on the same object. 4156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 4256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>If you are looking at this class, chances are very high you are 4356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * implementing Auto-Escaping for a new template system. Please see the 4456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * landing page including a design document at 4556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <a href="http://go/autoescape">Auto-Escape Landing Page</a>. 4656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 4756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpublic class HtmlParserImpl extends GenericParser implements HtmlParser { 4856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 4956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /* 5056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Internal representation of the parser state, which is at a 5156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * finer-granularity than the external state as given to callers. 5256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * The relationship between <code>InternalState</code> and 5356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <code>ExternalState</code> is a many-to-one relationship. 5456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 5556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState TEXT; 5656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState TAG_START; 5756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState TAG_NAME; 5856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState DECL_START; 5956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState DECL_BODY; 6056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState COM_OPEN; 6156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState COM_BODY; 6256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState COM_DASH; 6356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState COM_DASH_DASH; 6456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState PI; 6556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState PI_MAY_END; 6656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState TAG_SPACE; 6756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState TAG_CLOSE; 6856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState ATTR; 6956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState ATTR_SPACE; 7056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState VALUE; 7156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState VALUE_TEXT; 7256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState VALUE_Q_START; 7356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState VALUE_Q; 7456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState VALUE_DQ_START; 7556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState VALUE_DQ; 7656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState CDATA_COM_START; 7756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState CDATA_COM_START_DASH; 7856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState CDATA_COM_BODY; 7956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState CDATA_COM_DASH; 8056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState CDATA_COM_DASH_DASH; 8156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState CDATA_TEXT; 8256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState CDATA_LT; 8356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState CDATA_MAY_CLOSE; 8456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState JS_FILE; 8556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final InternalState CSS_FILE; 8656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 8756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson static { 8856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson TEXT = InternalState.getInstanceHtml("TEXT"); 8956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson TAG_START = InternalState.getInstanceHtml("TAG_START"); 9056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson TAG_NAME = InternalState.getInstanceHtml("TAG_NAME"); 9156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson DECL_START = InternalState.getInstanceHtml("DECL_START"); 9256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson DECL_BODY = InternalState.getInstanceHtml("DECL_BODY"); 9356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson COM_OPEN = InternalState.getInstanceHtml("COM_OPEN"); 9456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson COM_BODY = InternalState.getInstanceHtml("COM_BODY"); 9556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson COM_DASH = InternalState.getInstanceHtml("COM_DASH"); 9656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson COM_DASH_DASH = InternalState.getInstanceHtml("COM_DASH_DASH"); 9756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson PI =InternalState.getInstanceHtml("PI"); 9856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson PI_MAY_END = InternalState.getInstanceHtml("PI_MAY_END"); 9956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson TAG_SPACE = InternalState.getInstanceHtml("TAG_SPACE"); 10056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson TAG_CLOSE = InternalState.getInstanceHtml("TAG_CLOSE"); 10156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson ATTR = InternalState.getInstanceHtml("ATTR"); 10256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson ATTR_SPACE = InternalState.getInstanceHtml("ATTR_SPACE"); 10356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson VALUE = InternalState.getInstanceHtml("VALUE"); 10456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson VALUE_TEXT = InternalState.getInstanceHtml("VALUE_TEXT"); 10556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson VALUE_Q_START = InternalState.getInstanceHtml("VALUE_Q_START"); 10656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson VALUE_Q = InternalState.getInstanceHtml("VALUE_Q"); 10756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson VALUE_DQ_START = InternalState.getInstanceHtml("VALUE_DQ_START"); 10856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson VALUE_DQ = InternalState.getInstanceHtml("VALUE_DQ"); 10956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson CDATA_COM_START = InternalState.getInstanceHtml("CDATA_COM_START"); 11056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson CDATA_COM_START_DASH = 11156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState.getInstanceHtml("CDATA_COM_START_DASH"); 11256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson CDATA_COM_BODY = InternalState.getInstanceHtml("CDATA_COM_BODY"); 11356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson CDATA_COM_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH"); 11456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson CDATA_COM_DASH_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH_DASH"); 11556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson CDATA_TEXT = InternalState.getInstanceHtml("CDATA_TEXT"); 11656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson CDATA_LT = InternalState.getInstanceHtml("CDATA_LT"); 11756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson CDATA_MAY_CLOSE = InternalState.getInstanceHtml("CDATA_MAY_CLOSE"); 11856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson JS_FILE = InternalState.getInstanceHtml("JS_FILE"); 11956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson CSS_FILE = InternalState.getInstanceHtml("CSS_FILE"); 12056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 12156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 12256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final Map<InternalState, ExternalState> STATE_MAPPING = 12356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson Maps.newHashMap(); 12456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson static { 12556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson initializeStateMapping(); 12656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 12756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 12856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final ParserStateTable STATE_TABLE = new ParserStateTable(); 12956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson static { 13056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson initializeParserStateTable(); 13156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 13256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 13356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private final CharacterRecorder tag; 13456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private final CharacterRecorder attr; 13556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private final CharacterRecorder value; 13656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private final CharacterRecorder cdataCloseTag; 13756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private final EntityResolver entityResolver; 13856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private final JavascriptParserImpl jsParser; 13956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private boolean insideJavascript; 14056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private int valueIndex; 14156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // True iff InsertText() was called at the start of a URL attribute value. 14256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private boolean textInsideUrlValue; 14356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 14456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 14556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Creates an {@code HtmlParserImpl} object. 14656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 14756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>Both for performance reasons and to leverage code a state-flow machine 14856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * that is automatically generated from Python for multiple target 14956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * languages, this object uses a static {@code ParserStateTable} that 15056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * is read-only and obtained from the generated code in {@code HtmlParserFsm}. 15156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * That code also maintains the mapping from internal states 15256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * ({@code InternalState}) to external states ({@code ExternalState}). 15356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 15456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public HtmlParserImpl() { 15556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson super(STATE_TABLE, STATE_MAPPING, TEXT); 15656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson tag = new CharacterRecorder(); 15756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson attr = new CharacterRecorder(); 15856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson value = new CharacterRecorder(); 15956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson cdataCloseTag = new CharacterRecorder(); 16056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson entityResolver = new EntityResolver(); 16156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson jsParser = new JavascriptParserImpl(); 16256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = false; 16356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson valueIndex = 0; 16456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson textInsideUrlValue = false; 16556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 16656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 16756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 16856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Creates an {@code HtmlParserImpl} that is a copy of the one provided. 16956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 17056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy 17156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 17256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) { 17356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson super(aHtmlParserImpl); 17456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson tag = new CharacterRecorder(aHtmlParserImpl.tag); 17556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson attr = new CharacterRecorder(aHtmlParserImpl.attr); 17656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson value = new CharacterRecorder(aHtmlParserImpl.value); 17756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag); 17856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver); 17956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser); 18056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = aHtmlParserImpl.insideJavascript; 18156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson valueIndex = aHtmlParserImpl.valueIndex; 18256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue; 18356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 18456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 18556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 18656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public boolean inJavascript() { 18756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return (insideJavascript 18856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson && ( (getState() == STATE_VALUE) 18956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_TEXT) 19056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_START) 19156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_START_DASH) 19256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_BODY) 19356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_DASH) 19456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_DASH_DASH) 19556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_LT) 19656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_MAY_CLOSE) 19756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == JS_FILE) )); 19856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 19956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 20056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 20156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public boolean isJavascriptQuoted() { 20256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (inJavascript()) { 20356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson ExternalState jsParserState = jsParser.getState(); 20456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return (jsParserState == JavascriptParserImpl.STATE_Q 20556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || jsParserState == JavascriptParserImpl.STATE_DQ); 20656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 20756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return false; 20856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 20956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 21056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 21156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public boolean inAttribute() { 21256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson ExternalState extState = getState(); 21356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return (extState != null && (extState == STATE_ATTR 21456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || extState == STATE_VALUE)); 21556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 21656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 21756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 21856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Returns {@code true} if and only if the parser is currently within 21956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * a CSS context. A CSS context is one of the below: 22056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <ul> 22156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <li>Inside a STYLE tag. 22256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <li>Inside a STYLE attribute. 22356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <li>Inside a CSS file when the parser was reset in the CSS mode. 22456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * </ul> 22556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 22656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @return {@code true} if and only if the parser is inside CSS 22756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 22856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 22956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public boolean inCss() { 23056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return (currentState == CSS_FILE 23156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (getState() == STATE_VALUE 23256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson && (getAttributeType() == ATTR_TYPE.STYLE)) 23356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || ("style".equals(getTag()))); 23456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 23556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 23656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 23756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public ATTR_TYPE getAttributeType() { 23856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson String attribute = getAttribute(); 23956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (!inAttribute()) { 24056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return ATTR_TYPE.NONE; 24156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 24256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (HtmlUtils.isAttributeJavascript(attribute)) { 24356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return ATTR_TYPE.JS; 24456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 24556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (HtmlUtils.isAttributeUri(attribute)) { 24656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return ATTR_TYPE.URI; 24756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 24856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (HtmlUtils.isAttributeStyle(attribute)) { 24956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return ATTR_TYPE.STYLE; 25056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 25156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 25256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Special logic to handle the "content" attribute of the "meta" tag. 25356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if ("meta".equals(getTag()) && "content".equals(getAttribute())) { 25456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson HtmlUtils.META_REDIRECT_TYPE redirectType = 25556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson HtmlUtils.parseContentAttributeForUrl(getValue()); 25656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (redirectType == HtmlUtils.META_REDIRECT_TYPE.URL_START || 25756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson redirectType == HtmlUtils.META_REDIRECT_TYPE.URL) 25856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return ATTR_TYPE.URI; 25956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 26056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 26156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return ATTR_TYPE.REGULAR; 26256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 26356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 26456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 26556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public ExternalState getJavascriptState() { 26656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return jsParser.getState(); 26756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 26856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 26956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 27056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public boolean isAttributeQuoted() { 27156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return (currentState == VALUE_Q_START 27256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || currentState == VALUE_Q 27356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || currentState == VALUE_DQ_START 27456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || currentState == VALUE_DQ); 27556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 27656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 27756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 27856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public String getTag() { 27956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return tag.getContent().toLowerCase(); 28056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 28156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 28256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 28356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public String getAttribute() { 28456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return inAttribute() ? attr.getContent().toLowerCase() : ""; 28556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 28656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 28756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 28856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public String getValue() { 28956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return (getState() == STATE_VALUE) ? value.getContent() : ""; 29056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 29156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 29256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 29356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public int getValueIndex() { 29456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (getState() != STATE_VALUE) { 29556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return 0; 29656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 29756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return valueIndex; 29856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 29956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 30056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 30156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public boolean isUrlStart() { 30256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // False when not inside an HTML attribute value 30356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (getState() != STATE_VALUE) { 30456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return false; 30556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 30656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 30756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Or when the HTML attribute is not of URI type. 30856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (getAttributeType() != ATTR_TYPE.URI) { 30956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return false; 31056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 31156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 31256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Or when we received an InsertText() directive at the start of a URL. 31356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (textInsideUrlValue) { 31456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return false; 31556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 31656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 31756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if ("meta".equals(getTag())) { 31856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // At this point, we know we are in the "content" attribute 31956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // or we would not have the URI attribute type. 32056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return (HtmlUtils.parseContentAttributeForUrl(getValue()) == 32156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson HtmlUtils.META_REDIRECT_TYPE.URL_START); 32256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 32356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 32456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // For all other URI attributes, check if we are at index 0. 32556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return (getValueIndex() == 0); 32656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson} 32756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 32856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 32956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@inheritDoc} 33056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 33156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Resets the state of the parser to a state consistent with the 33256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@code Mode} provided. This will reset finer-grained state 33356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * information back to a default value, hence use only when 33456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * you want to parse text from a very clean slate. 33556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 33656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>See the {@link HtmlParser.Mode} enum for information on all 33756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * the valid modes. 33856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 33956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param mode is an enum representing the high-level state of the parser 34056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 34156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 34256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public void resetMode(Mode mode) { 34356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = false; 34456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson tag.reset(); 34556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson attr.reset(); 34656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson value.reset(); 34756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson cdataCloseTag.reset(); 34856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson valueIndex = 0; 34956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson textInsideUrlValue = false; 35056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson jsParser.reset(); 35156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 35256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson switch (mode) { 35356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson case HTML: 35456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson currentState = TEXT; 35556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson break; 35656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson case JS: 35756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson currentState = JS_FILE; 35856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = true; 35956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson break; 36056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson case CSS: 36156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson currentState = CSS_FILE; 36256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson break; 36356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson case HTML_IN_TAG: 36456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson currentState = TAG_SPACE; 36556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson break; 36656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson default: 36756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson throw new IllegalArgumentException("Did not recognize Mode: " + 36856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson mode.toString()); 36956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 37056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 37156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 37256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 37356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Resets the state of the parser to the initial state of parsing HTML. 37456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 37556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public void reset() { 37656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson super.reset(); 37756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson resetMode(Mode.HTML); 37856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 37956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 38056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 38156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * A specialized directive to tell the parser there is some content 38256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * that will be inserted here but that it will not get to parse. Used 38356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * by the template system that may not be able to give some content 38456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * to the parser but wants it to know there typically will be content 38556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * inserted at that point. This is a hint used in corner cases within 38656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * parsing of HTML attribute names and values where content we do not 38756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * get to see could affect our parsing and alter our current state. 38856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 38956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>The two cases where {@code #insertText()} affects our parsing are: 39056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <ul> 39156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <li>We are at the start of the value of a URL-accepting HTML attribute. In 39256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * that case, we change internal state to no longer be considered at the 39356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * start of the URL. This may affect what escaping template systems may want 39456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * to perform on the HTML attribute value. We avoid injecting fake data and 39556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * hence not modify the current index of the value as determined by 39656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@link #getValueIndex()}</li> 39756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <li>We just transitioned from an attribute name to an attribute value 39856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * (by parsing the separating {@code '='} character). In that case, we 39956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * change internal state to be now inside a non-quoted HTML attribute 40056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * value.</li> 40156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * </ul> 40256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 40356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @throws ParseException if an unrecoverable error occurred during parsing 40456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 40556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 40656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public void insertText() throws ParseException { 40756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Case: Inside URL attribute value. 40856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (getState() == STATE_VALUE 40956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson && getAttributeType() == ATTR_TYPE.URI 41056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson && isUrlStart()) { 41156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson textInsideUrlValue = true; 41256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 41356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Case: Before parsing any attribute value. 41456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (currentState == VALUE) { 41556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson setNextState(VALUE_TEXT); 41656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 41756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 41856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 41956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 42056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson protected InternalState handleEnterState(InternalState currentState, 42156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState expectedNextState, 42256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson char input) { 42356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState nextState = expectedNextState; 42456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (currentState == TAG_NAME) { 42556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson enterTagName(); 42656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else if (currentState == ATTR) { 42756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson enterAttribute(); 42856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else if (currentState == TAG_CLOSE) { 42956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson nextState = tagClose(currentState); 43056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else if (currentState == CDATA_MAY_CLOSE) { 43156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson enterStateCdataMayClose(); 43256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else if (currentState == VALUE) { 43356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson enterValue(); 43456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else 43556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (currentState == VALUE_TEXT || currentState == VALUE_Q 43656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || currentState == VALUE_DQ) { 43756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson enterValueContent(); 43856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 43956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return nextState; 44056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 44156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 44256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 44356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson protected InternalState handleExitState(InternalState currentState, 44456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState expectedNextState, 44556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson char input) { 44656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState nextState = expectedNextState; 44756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (currentState == TAG_NAME) { 44856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson exitTagName(); 44956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else if (currentState == ATTR) { 45056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson exitAttribute(); 45156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else if (currentState == CDATA_MAY_CLOSE) { 45256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson nextState = exitStateCdataMayClose(nextState, input); 45356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else 45456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if ((currentState == VALUE_TEXT) || (currentState == VALUE_Q) 45556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == VALUE_DQ)) { 45656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson exitValueContent(); 45756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 45856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return nextState; 45956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 46056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 46156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 46256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson protected InternalState handleInState(InternalState currentState, 46356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson char input) throws ParseException { 46456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if ((currentState == CDATA_TEXT) 46556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_START) 46656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_START_DASH) 46756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_BODY) 46856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_DASH) 46956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_COM_DASH_DASH) 47056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_LT) 47156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == CDATA_MAY_CLOSE) 47256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == JS_FILE)) { 47356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson inStateCdata(input); 47456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else if ((currentState == VALUE_TEXT) 47556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == VALUE_Q) 47656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || (currentState == VALUE_DQ)) { 47756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson inStateValue(input); 47856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 47956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return currentState; 48056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 48156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 48256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 48356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Invokes recording on all CharacterRecorder objects. Currently we do 48456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * not check that one and only one of them is recording. I did a fair 48556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * bit of testing on the C++ parser and was not convinced there is 48656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * such a guarantee. 48756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 48856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson @Override 48956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson protected void record(char input) { 49056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson attr.maybeRecord(input); 49156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson tag.maybeRecord(input); 49256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson value.maybeRecord(input); 49356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson cdataCloseTag.maybeRecord(input); 49456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 49556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 49656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 49756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Starts recording the name of the HTML tag. Called when the parser 49856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * enters a new tag. 49956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 50056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void enterTagName() { 50156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson tag.startRecording(); 50256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 50356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 50456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void exitTagName() { 50556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson tag.stopRecording(); 50656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson String tagString = tag.getContent(); 50756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (!tagString.isEmpty() && tagString.charAt(0) == '/') { 50856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson tag.reset(); 50956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 51056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 51156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 51256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 51356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Starts recording the name of the HTML attribute. Called when the parser 51456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * enters a new HTML attribute. 51556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 51656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void enterAttribute() { 51756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson attr.startRecording(); 51856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 51956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 52056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void exitAttribute() { 52156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson attr.stopRecording(); 52256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 52356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 52456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 52556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Tracks the index within the HTML attribute value and initializes 52656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * the javascript parser for attributes that take javascript. 52756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 52856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Called when the parser enters a new HTML attribute value. 52956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 53056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void enterValue() { 53156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson valueIndex = 0; 53256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson textInsideUrlValue = false; 53356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (HtmlUtils.isAttributeJavascript(getAttribute())) { 53456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson entityResolver.reset(); 53556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson jsParser.reset(); 53656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = true; 53756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else { 53856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = false; 53956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 54056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 54156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 54256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 54356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Starts recordning the contents of the attribute value. 54456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 54556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Called when entering an attribute value. 54656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 54756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void enterValueContent() { 54856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson value.startRecording(); 54956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 55056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 55156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 55256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Stops the recording of the attribute value and exits javascript 55356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * (in case we were inside it). 55456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 55556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void exitValueContent() { 55656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson value.stopRecording(); 55756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = false; 55856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 55956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 56056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 56156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Processes javascript after performing entity resolution and updates 56256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * the position within the attribute value. 56356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * If the status of the entity resolution is <code>IN_PROGRESS</code>, 56456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * we don't invoke the javascript parser. 56556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 56656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>Called for every character inside an attribute value. 56756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 56856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param input character read 56956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @throws ParseException if an unrecoverable error occurred during parsing 57056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 57156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void inStateValue(char input) throws ParseException { 57256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson valueIndex++; 57356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (insideJavascript) { 57456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson EntityResolver.Status status = entityResolver.processChar(input); 57556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (status == EntityResolver.Status.COMPLETED) { 57656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson jsParser.parse(entityResolver.getEntity()); 57756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson entityResolver.reset(); 57856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else if (status == EntityResolver.Status.NOT_STARTED) { 57956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson jsParser.parse(input); 58056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 58156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 58256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 58356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 58456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 58556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Handles the tag it finished reading. 58656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 58756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>For a script tag, it initializes the javascript parser. For all 58856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * tags that are recognized to have CDATA values 58956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * (including the script tag), it switches the CDATA state to handle them 59056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * properly. For code simplification, CDATA and RCDATA sections are 59156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * treated the same. 59256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 59356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>Called when the parser leaves a tag definition. 59456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 59556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param state current state 59656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @return state next state, could be the same as current state 59756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 59856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private InternalState tagClose(InternalState state) { 59956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState nextState = state; 60056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson String tagName = getTag(); 60156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if ("script".equals(tagName)) { 60256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson nextState = CDATA_TEXT; 60356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson jsParser.reset(); 60456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = true; 60556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else if ("style".equals(tagName) 60656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || "title".equals(tagName) 60756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson || "textarea".equals(tagName)) { 60856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson nextState = CDATA_TEXT; 60956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = false; 61056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 61156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return nextState; 61256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 61356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 61456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 61556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Feeds the character to the javascript parser for processing. 61656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 61756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>Called inside CDATA blocks to parse javascript. 61856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 61956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param input character read 62056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @throws ParseException if an unrecoverable error occurred during parsing 62156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 62256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void inStateCdata(char input) throws ParseException { 62356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (insideJavascript) { 62456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson jsParser.parse(input); 62556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 62656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 62756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 62856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 62956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Starts recording. This is so we find the closing tag name in order to 63056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * know if the tag is going to be closed or not. 63156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 63256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>Called when encountering a '<' character in a CDATA section. 63356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 63456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private void enterStateCdataMayClose() { 63556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson cdataCloseTag.startRecording(); 63656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 63756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 63856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 63956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Determines whether to close the tag element, It closes it if it finds 64056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * the corresponding end tag. Called when reading what could be a 64156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * closing CDATA tag. 64256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 64356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param input the character read 64456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param expectedNextState the expected state to go to next 64556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * unless we want to change it here 64656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @return the next state to go to 64756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 64856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private InternalState exitStateCdataMayClose( 64956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState expectedNextState, 65056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson char input) { 65156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState nextState = expectedNextState; 65256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson cdataCloseTag.stopRecording(); 65356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson String cdataCloseTagString = cdataCloseTag.getContent(); 65456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson Preconditions.checkState(!cdataCloseTagString.isEmpty() 65556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson && cdataCloseTagString.charAt(0) == '/'); // Developer error. 65656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 65756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (cdataCloseTagString.substring(1).equalsIgnoreCase(getTag()) 65856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson && (input == '>' || HtmlUtils.isHtmlSpace(input))) { 65956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson tag.clear(); 66056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson insideJavascript = false; 66156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else { 66256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson nextState = CDATA_TEXT; 66356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 66456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return nextState; 66556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 66656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 66756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 66856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // ======================================================= // 66956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE. // 67056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // ======================================================= // 67156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 67256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static void registerMapping(InternalState internalState, 67356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson ExternalState externalState) { 67456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson STATE_MAPPING.put(internalState, externalState); 67556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 67656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 67756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static void initializeStateMapping() { 67856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Each parser implementation must map the error state appropriately. 67956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR); 68056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 68156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(TEXT, HtmlParser.STATE_TEXT); 68256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(TAG_START, HtmlParser.STATE_TAG); 68356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(TAG_NAME, HtmlParser.STATE_TAG); 68456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(DECL_START, HtmlParser.STATE_TEXT); 68556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(DECL_BODY, HtmlParser.STATE_TEXT); 68656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(COM_OPEN, HtmlParser.STATE_TEXT); 68756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(COM_BODY, HtmlParser.STATE_COMMENT); 68856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(COM_DASH, HtmlParser.STATE_COMMENT); 68956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(COM_DASH_DASH, HtmlParser.STATE_COMMENT); 69056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(PI, HtmlParser.STATE_TEXT); 69156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(PI_MAY_END, HtmlParser.STATE_TEXT); 69256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(TAG_SPACE, HtmlParser.STATE_TAG); 69356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(TAG_CLOSE, HtmlParser.STATE_TEXT); 69456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(ATTR, HtmlParser.STATE_ATTR); 69556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(ATTR_SPACE, HtmlParser.STATE_ATTR); 69656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(VALUE, HtmlParser.STATE_VALUE); 69756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(VALUE_TEXT, HtmlParser.STATE_VALUE); 69856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(VALUE_Q_START, HtmlParser.STATE_VALUE); 69956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(VALUE_Q, HtmlParser.STATE_VALUE); 70056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(VALUE_DQ_START, HtmlParser.STATE_VALUE); 70156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(VALUE_DQ, HtmlParser.STATE_VALUE); 70256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(CDATA_COM_START, HtmlParser.STATE_TEXT); 70356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(CDATA_COM_START_DASH, HtmlParser.STATE_TEXT); 70456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(CDATA_COM_BODY, HtmlParser.STATE_TEXT); 70556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(CDATA_COM_DASH, HtmlParser.STATE_TEXT); 70656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(CDATA_COM_DASH_DASH, HtmlParser.STATE_TEXT); 70756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(CDATA_TEXT, HtmlParser.STATE_TEXT); 70856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(CDATA_LT, HtmlParser.STATE_TEXT); 70956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(CDATA_MAY_CLOSE, HtmlParser.STATE_TEXT); 71056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE); 71156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE); 71256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 71356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 71456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static void registerTransition(String expression, 71556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState source, 71656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson InternalState to) { 71756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // It seems to silly to go through a StateTableTransition here 71856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // but it adds extra data checking. 71956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson StateTableTransition stt = new StateTableTransition(expression, 72056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson source, to); 72156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(), 72256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson stt.getTo()); 72356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 72456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 72556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // NOTE: The "[:default:]" transition should be registered before any 72656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // other transitions for a given state or it will over-write them. 72756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static void initializeParserStateTable() { 72856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", CSS_FILE, CSS_FILE); 72956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", JS_FILE, JS_FILE); 73056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT); 73156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE); 73256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", CDATA_MAY_CLOSE, TEXT); 73356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE); 73456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", CDATA_LT, CDATA_TEXT); 73556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("!", CDATA_LT, CDATA_COM_START); 73656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE); 73756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT); 73856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("<", CDATA_TEXT, CDATA_LT); 73956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY); 74056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT); 74156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH); 74256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY); 74356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH); 74456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY); 74556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH); 74656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT); 74756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY); 74856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT); 74956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH); 75056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", VALUE_DQ, VALUE_DQ); 75156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("\"", VALUE_DQ, TAG_SPACE); 75256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ); 75356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("\"", VALUE_DQ_START, TAG_SPACE); 75456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", VALUE_Q, VALUE_Q); 75556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("\'", VALUE_Q, TAG_SPACE); 75656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", VALUE_Q_START, VALUE_Q); 75756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("\'", VALUE_Q_START, TAG_SPACE); 75856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT); 75956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE); 76056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", VALUE_TEXT, TAG_CLOSE); 76156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", VALUE, VALUE_TEXT); 76256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", VALUE, TAG_CLOSE); 76356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(" \t\n\r", VALUE, VALUE); 76456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("\"", VALUE, VALUE_DQ_START); 76556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("\'", VALUE, VALUE_Q_START); 76656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("=", ATTR_SPACE, VALUE); 76756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("/", ATTR_SPACE, TAG_SPACE); 76856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR); 76956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE); 77056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", ATTR_SPACE, TAG_CLOSE); 77156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(" \t\n\r", ATTR, ATTR_SPACE); 77256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("=", ATTR, VALUE); 77356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("/", ATTR, TAG_SPACE); 77456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", ATTR, TAG_CLOSE); 77556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("A-Za-z0-9_:.-", ATTR, ATTR); 77656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", TAG_CLOSE, TEXT); 77756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("<", TAG_CLOSE, TAG_START); 77856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("/", TAG_SPACE, TAG_SPACE); 77956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR); 78056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE); 78156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", TAG_SPACE, TAG_CLOSE); 78256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", PI_MAY_END, PI); 78356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", PI_MAY_END, TEXT); 78456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", PI, PI); 78556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("?", PI, PI_MAY_END); 78656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", COM_DASH_DASH, COM_BODY); 78756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", COM_DASH_DASH, TEXT); 78856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", COM_DASH_DASH, COM_DASH_DASH); 78956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", COM_DASH, COM_BODY); 79056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", COM_DASH, COM_DASH_DASH); 79156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", COM_BODY, COM_BODY); 79256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", COM_BODY, COM_DASH); 79356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", COM_OPEN, TEXT); 79456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", COM_OPEN, COM_BODY); 79556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", DECL_BODY, DECL_BODY); 79656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", DECL_BODY, TEXT); 79756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", DECL_START, DECL_BODY); 79856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", DECL_START, TEXT); 79956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("-", DECL_START, COM_OPEN); 80056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(">", TAG_NAME, TAG_CLOSE); 80156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE); 80256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME); 80356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 80456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Manual change to remain in-sync with CL 10597850 in C HtmlParser. 80556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", TAG_START, TEXT); 80656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("<", TAG_START, TAG_START); 80756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // End of manual change. 80856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 80956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("!", TAG_START, DECL_START); 81056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("?", TAG_START, PI); 81156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME); 81256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("[:default:]", TEXT, TEXT); 81356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson registerTransition("<", TEXT, TAG_START); 81456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 81556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson} 816