/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.streamhtmlparser;

1956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/**
2056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Methods exposed for HTML parsing of text to facilitate implementation
2156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * of Automatic context-aware escaping. The HTML parser also embeds a
2256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Javascript parser for processing Javascript fragments. In the future,
2356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * it will also embed other specific parsers and hence most likely remain
2456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * the main interface to callers of this package.
2556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
2656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>Note: These are the exact methods exposed in the original C++ Parser. The
2756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * names are simply modified to conform to Java.
2856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */
2956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpublic interface HtmlParser extends Parser {
3056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
3156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
3256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * The Parser Mode requested for parsing a given template.
3356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Currently we support:
3456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
3556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>{@code HTML} for HTML templates.
3656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>{@code JS} for javascript templates.
3756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>{@code CSS} for Cascading Style-Sheets templates.
3856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>{@code HTML_IN_TAG} for HTML templates that consist only of
3956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     HTML attribute name and value pairs. This is typically the case for
4056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     a template that is being included from a parent template where the
4156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     parent template contains the start and the closing of the HTML tag.
4256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     This is a special mode, for standard HTML templates please use
4356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     {@link #HTML}.
4456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     An example of such as template is:
4556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     <p><code>class="someClass" target="_blank"</code></p>
4656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     <p>Which could be included from a parent template that contains
4756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     an anchor tag, say:</p>
4856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     <p><code>&lt;a href="/bla" ["INCLUDED_TEMPLATE"]&gt;</code></p>
4956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
5056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
5156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public enum Mode {
5256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    HTML,
5356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    JS,
5456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    CSS,
5556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    HTML_IN_TAG
5656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
5756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
5856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
5956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Indicates the type of HTML attribute that the parser is currently in or
6056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code NONE} if the parser is not currently in an attribute.
6156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code URI} is for attributes taking a URI such as "href" and "src".
6256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code JS} is for attributes taking javascript such as "onclick".
6356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code STYLE} is for the "style" attribute.
6456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * All other attributes fall under {@code REGULAR}.
6556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
6656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returned by {@link HtmlParser#getAttributeType()}
6756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
6856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public enum ATTR_TYPE {
6956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    NONE,
7056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    REGULAR,
7156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    URI,
7256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    JS,
7356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    STYLE
7456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
7556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
7656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
7756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * All the states in which the parser can be. These are external states.
7856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * The parser has many more internal states that are not exposed and which
7956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * are instead mapped to one of these external ones.
8056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code STATE_TEXT} the parser is in HTML proper.
8156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code STATE_TAG} the parser is inside an HTML tag name.
8256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code STATE_COMMENT} the parser is inside an HTML comment.
8356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code STATE_ATTR} the parser is inside an HTML attribute name.
8456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code STATE_VALUE} the parser is inside an HTML attribute value.
8556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code STATE_JS_FILE} the parser is inside javascript code.
8656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code STATE_CSS_FILE} the parser is inside CSS code.
8756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
8856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>All these states map exactly to those exposed in the C++ (original)
8956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * version of the HtmlParser.
9056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
9156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public final static ExternalState STATE_TEXT =
9256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      new ExternalState("STATE_TEXT");
9356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public final static ExternalState STATE_TAG =
9456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      new ExternalState("STATE_TAG");
9556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public final static ExternalState STATE_COMMENT =
9656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      new ExternalState("STATE_COMMENT");
9756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public final static ExternalState STATE_ATTR =
9856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      new ExternalState("STATE_ATTR");
9956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public final static ExternalState STATE_VALUE =
10056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      new ExternalState("STATE_VALUE");
10156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public final static ExternalState STATE_JS_FILE =
10256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      new ExternalState("STATE_JS_FILE");
10356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public final static ExternalState STATE_CSS_FILE =
10456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      new ExternalState("STATE_CSS_FILE");
10556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
10656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
10756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns {@code true} if the parser is currently processing Javascript.
10856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Such is the case if and only if, the parser is processing an attribute
10956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * that takes Javascript, a Javascript script block or the parser
11056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * is (re)set with {@link Mode#JS}.
11156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
11256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if the parser is processing Javascript,
11356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *         {@code false} otherwise
11456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
11556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean inJavascript();
11656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
11756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
11856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns {@code true} if the parser is currently processing
11956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * a Javascript litteral that is quoted. The caller will typically
12056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * invoke this method after determining that the parser is processing
12156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Javascript. Knowing whether the element is quoted or not helps
12256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * determine which escaping to apply to it when needed.
12356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
12456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if and only if the parser is inside a quoted
12556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *         Javascript literal
12656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
12756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean isJavascriptQuoted();
12856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
12956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
13056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
13156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns {@code true} if and only if the parser is currently within
13256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * an attribute, be it within the attribute name or the attribute value.
13356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
13456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if and only if inside an attribute
13556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
13656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean inAttribute();
13756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
13856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
13956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns {@code true} if and only if the parser is currently within
14056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * a CSS context. A CSS context is one of the below:
14156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
14256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>Inside a STYLE tag.
14356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>Inside a STYLE attribute.
14456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>Inside a CSS file when the parser was reset in the CSS mode.
14556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
14656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
14756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if and only if the parser is inside CSS
14856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
14956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean inCss();
15056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
15156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
15256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns the type of the attribute that the parser is in
15356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * or {@code ATTR_TYPE.NONE} if we are not parsing an attribute.
15456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * The caller will typically invoke this method after determining
15556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * that the parser is processing an attribute.
15656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
15756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>This is useful to determine which escaping to apply based
15856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * on the type of value this attribute expects.
15956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
16056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return type of the attribute
16156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @see HtmlParser.ATTR_TYPE
16256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
16356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public ATTR_TYPE getAttributeType();
16456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
16556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
16656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns {@code true} if and only if the parser is currently within
16756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * an attribute value and that attribute value is quoted.
16856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
16956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if and only if the attribute value is quoted
17056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
17156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean isAttributeQuoted();
17256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
17356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
17456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
17556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns the name of the HTML tag if the parser is currently within one.
17656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Note that the name may be incomplete if the parser is currently still
17756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * parsing the name. Returns an empty {@code String} if the parser is not
17856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * in a tag as determined by {@code getCurrentExternalState}.
17956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
18056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return the name of the HTML tag or an empty {@code String} if we are
18156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *         not within an HTML tag
18256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
18356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public String getTag();
18456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
18556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
18656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns the name of the HTML attribute the parser is currently processing.
18756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * If the parser is still parsing the name, then the returned name
18856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * may be incomplete. Returns an empty {@code String} if the parser is not
18956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * in an attribute as determined by {@code getCurrentExternalState}.
19056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
19156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return the name of the HTML attribute or an empty {@code String}
19256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *         if we are not within an HTML attribute
19356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
19456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public String getAttribute();
19556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
19656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
19756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns the value of an HTML attribute if the parser is currently
19856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * within one. If the parser is currently parsing the value, the returned
19956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * value may be incomplete. The caller will typically first determine
20056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * that the parser is processing a value by calling
20156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code getCurrentExternalState}.
20256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
20356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return the value, could be an empty {@code String} if the parser is not
20456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *         in an HTML attribute value
20556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
20656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public String getValue();
20756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
20856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
20956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns the current position of the parser within the HTML attribute
21056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * value, zero being the position of the first character in the value.
21156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * The caller will typically first determine that the parser is
21256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * processing a value by calling {@link #getState()}.
21356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
21456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return the index or zero if the parser is not processing a value
21556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
21656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public int getValueIndex();
21756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
21856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
21956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns {@code true} if and only if the current position of the parser is
22056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * at the start of a URL HTML attribute value. This is the case when the
22156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * following three conditions are all met:
22256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>
22356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ol>
22456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>The parser is in an HTML attribute value.
22556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>The HTML attribute expects a URL, as determined by
22656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     {@link #getAttributeType()} returning {@code .ATTR_TYPE#URI}.
22756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>The parser has not yet seen any characters from that URL.
22856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ol>
22956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
23056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p> This method may be used by an Html Sanitizer or an Auto-Escape system
23156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * to determine whether to validate the URL for well-formedness and validate
23256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * the scheme of the URL (e.g. {@code HTTP}, {@code HTTPS}) is safe.
23356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * In particular, it is recommended to use this method instead of
23456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * checking that {@link #getValueIndex()} is {@code 0} to support attribute
23556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * types where the URL does not start at index zero, such as the
23656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code content} attribute of the {@code meta} HTML tag.
23756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
23856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if and only if the parser is at the start of the URL
23956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
24056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public boolean isUrlStart();
24156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
24256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
24356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Resets the state of the parser, allowing for reuse of the
24456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code HtmlParser} object.
24556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
24656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>See the {@link HtmlParser.Mode} enum for information on all
24756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * the valid modes.
24856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
24956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param mode is an enum representing the high-level state of the parser
25056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
25156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public void resetMode(HtmlParser.Mode mode);
25256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
25356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
25456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * A specialized directive to tell the parser there is some content
25556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * that will be inserted here but that it will not get to parse. Used
25656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * by the template system that may not be able to give some content
25756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * to the parser but wants it to know there typically will be content
25856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * inserted at that point. This is a hint used in corner cases within
25956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * parsing of HTML attribute names and values where content we do not
26056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * get to see could affect our parsing and alter our current state.
26156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
26256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>Returns {@code false} if and only if the parser encountered
26356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * a fatal error which prevents it from continuing further parsing.
26456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
26556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>Note: The return value is different from the C++ Parser which
26656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * always returns {@code true} but in my opinion makes more sense.
26756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
26856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @throws ParseException if an unrecoverable error occurred during parsing
26956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
27056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public void insertText() throws ParseException;
27156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
27256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
27356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns the state the Javascript parser is in.
27456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
27556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>See {@link JavascriptParser} for more information on the valid
27656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * external states. The caller will typically first determine that the
27756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * parser is processing Javascript and then invoke this method to
27856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * obtain more fine-grained state information.
27956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
28056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return external state of the javascript parser
28156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
28256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public ExternalState getJavascriptState();
28356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson}
284