// HtmlParser.java revision 56ed4167b942ec265f9cee70ac4d71d10b3835ce
/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

17package com.google.streamhtmlparser;
18
19/**
20 * Methods exposed for HTML parsing of text to facilitate implementation
21 * of Automatic context-aware escaping. The HTML parser also embeds a
22 * Javascript parser for processing Javascript fragments. In the future,
23 * it will also embed other specific parsers and hence most likely remain
24 * the main interface to callers of this package.
25 *
26 * <p>Note: These are the exact methods exposed in the original C++ Parser. The
27 * names are simply modified to conform to Java.
28 */
29public interface HtmlParser extends Parser {
30
31  /**
32   * The Parser Mode requested for parsing a given template.
33   * Currently we support:
34   * <ul>
35   * <li>{@code HTML} for HTML templates.
36   * <li>{@code JS} for javascript templates.
37   * <li>{@code CSS} for Cascading Style-Sheets templates.
38   * <li>{@code HTML_IN_TAG} for HTML templates that consist only of
39   *     HTML attribute name and value pairs. This is typically the case for
40   *     a template that is being included from a parent template where the
41   *     parent template contains the start and the closing of the HTML tag.
42   *     This is a special mode, for standard HTML templates please use
43   *     {@link #HTML}.
44   *     An example of such as template is:
45   *     <p><code>class="someClass" target="_blank"</code></p>
46   *     <p>Which could be included from a parent template that contains
47   *     an anchor tag, say:</p>
48   *     <p><code>&lt;a href="/bla" ["INCLUDED_TEMPLATE"]&gt;</code></p>
49   * </ul>
50   */
51  public enum Mode {
52    HTML,
53    JS,
54    CSS,
55    HTML_IN_TAG
56  }
57
58  /**
59   * Indicates the type of HTML attribute that the parser is currently in or
60   * {@code NONE} if the parser is not currently in an attribute.
61   * {@code URI} is for attributes taking a URI such as "href" and "src".
62   * {@code JS} is for attributes taking javascript such as "onclick".
63   * {@code STYLE} is for the "style" attribute.
64   * All other attributes fall under {@code REGULAR}.
65   *
66   * Returned by {@link HtmlParser#getAttributeType()}
67   */
68  public enum ATTR_TYPE {
69    NONE,
70    REGULAR,
71    URI,
72    JS,
73    STYLE
74  }
75
76  /**
77   * All the states in which the parser can be. These are external states.
78   * The parser has many more internal states that are not exposed and which
79   * are instead mapped to one of these external ones.
80   * {@code STATE_TEXT} the parser is in HTML proper.
81   * {@code STATE_TAG} the parser is inside an HTML tag name.
82   * {@code STATE_COMMENT} the parser is inside an HTML comment.
83   * {@code STATE_ATTR} the parser is inside an HTML attribute name.
84   * {@code STATE_VALUE} the parser is inside an HTML attribute value.
85   * {@code STATE_JS_FILE} the parser is inside javascript code.
86   * {@code STATE_CSS_FILE} the parser is inside CSS code.
87   *
88   * <p>All these states map exactly to those exposed in the C++ (original)
89   * version of the HtmlParser.
90   */
91  public final static ExternalState STATE_TEXT =
92      new ExternalState("STATE_TEXT");
93  public final static ExternalState STATE_TAG =
94      new ExternalState("STATE_TAG");
95  public final static ExternalState STATE_COMMENT =
96      new ExternalState("STATE_COMMENT");
97  public final static ExternalState STATE_ATTR =
98      new ExternalState("STATE_ATTR");
99  public final static ExternalState STATE_VALUE =
100      new ExternalState("STATE_VALUE");
101  public final static ExternalState STATE_JS_FILE =
102      new ExternalState("STATE_JS_FILE");
103  public final static ExternalState STATE_CSS_FILE =
104      new ExternalState("STATE_CSS_FILE");
105
106  /**
107   * Returns {@code true} if the parser is currently processing Javascript.
108   * Such is the case if and only if, the parser is processing an attribute
109   * that takes Javascript, a Javascript script block or the parser
110   * is (re)set with {@link Mode#JS}.
111   *
112   * @return {@code true} if the parser is processing Javascript,
113   *         {@code false} otherwise
114   */
115  public boolean inJavascript();
116
117  /**
118   * Returns {@code true} if the parser is currently processing
119   * a Javascript litteral that is quoted. The caller will typically
120   * invoke this method after determining that the parser is processing
121   * Javascript. Knowing whether the element is quoted or not helps
122   * determine which escaping to apply to it when needed.
123   *
124   * @return {@code true} if and only if the parser is inside a quoted
125   *         Javascript literal
126   */
127  public boolean isJavascriptQuoted();
128
129
130  /**
131   * Returns {@code true} if and only if the parser is currently within
132   * an attribute, be it within the attribute name or the attribute value.
133   *
134   * @return {@code true} if and only if inside an attribute
135   */
136  public boolean inAttribute();
137
138  /**
139   * Returns {@code true} if and only if the parser is currently within
140   * a CSS context. A CSS context is one of the below:
141   * <ul>
142   * <li>Inside a STYLE tag.
143   * <li>Inside a STYLE attribute.
144   * <li>Inside a CSS file when the parser was reset in the CSS mode.
145   * </ul>
146   *
147   * @return {@code true} if and only if the parser is inside CSS
148   */
149  public boolean inCss();
150
151  /**
152   * Returns the type of the attribute that the parser is in
153   * or {@code ATTR_TYPE.NONE} if we are not parsing an attribute.
154   * The caller will typically invoke this method after determining
155   * that the parser is processing an attribute.
156   *
157   * <p>This is useful to determine which escaping to apply based
158   * on the type of value this attribute expects.
159   *
160   * @return type of the attribute
161   * @see HtmlParser.ATTR_TYPE
162   */
163  public ATTR_TYPE getAttributeType();
164
165  /**
166   * Returns {@code true} if and only if the parser is currently within
167   * an attribute value and that attribute value is quoted.
168   *
169   * @return {@code true} if and only if the attribute value is quoted
170   */
171  public boolean isAttributeQuoted();
172
173
174  /**
175   * Returns the name of the HTML tag if the parser is currently within one.
176   * Note that the name may be incomplete if the parser is currently still
177   * parsing the name. Returns an empty {@code String} if the parser is not
178   * in a tag as determined by {@code getCurrentExternalState}.
179   *
180   * @return the name of the HTML tag or an empty {@code String} if we are
181   *         not within an HTML tag
182   */
183  public String getTag();
184
185  /**
186   * Returns the name of the HTML attribute the parser is currently processing.
187   * If the parser is still parsing the name, then the returned name
188   * may be incomplete. Returns an empty {@code String} if the parser is not
189   * in an attribute as determined by {@code getCurrentExternalState}.
190   *
191   * @return the name of the HTML attribute or an empty {@code String}
192   *         if we are not within an HTML attribute
193   */
194  public String getAttribute();
195
196  /**
197   * Returns the value of an HTML attribute if the parser is currently
198   * within one. If the parser is currently parsing the value, the returned
199   * value may be incomplete. The caller will typically first determine
200   * that the parser is processing a value by calling
201   * {@code getCurrentExternalState}.
202   *
203   * @return the value, could be an empty {@code String} if the parser is not
204   *         in an HTML attribute value
205   */
206  public String getValue();
207
208  /**
209   * Returns the current position of the parser within the HTML attribute
210   * value, zero being the position of the first character in the value.
211   * The caller will typically first determine that the parser is
212   * processing a value by calling {@link #getState()}.
213   *
214   * @return the index or zero if the parser is not processing a value
215   */
216  public int getValueIndex();
217
218  /**
219   * Returns {@code true} if and only if the current position of the parser is
220   * at the start of a URL HTML attribute value. This is the case when the
221   * following three conditions are all met:
222   * <p>
223   * <ol>
224   * <li>The parser is in an HTML attribute value.
225   * <li>The HTML attribute expects a URL, as determined by
226   *     {@link #getAttributeType()} returning {@code .ATTR_TYPE#URI}.
227   * <li>The parser has not yet seen any characters from that URL.
228   * </ol>
229   *
230   * <p> This method may be used by an Html Sanitizer or an Auto-Escape system
231   * to determine whether to validate the URL for well-formedness and validate
232   * the scheme of the URL (e.g. {@code HTTP}, {@code HTTPS}) is safe.
233   * In particular, it is recommended to use this method instead of
234   * checking that {@link #getValueIndex()} is {@code 0} to support attribute
235   * types where the URL does not start at index zero, such as the
236   * {@code content} attribute of the {@code meta} HTML tag.
237   *
238   * @return {@code true} if and only if the parser is at the start of the URL
239   */
240  public boolean isUrlStart();
241
242  /**
243   * Resets the state of the parser, allowing for reuse of the
244   * {@code HtmlParser} object.
245   *
246   * <p>See the {@link HtmlParser.Mode} enum for information on all
247   * the valid modes.
248   *
249   * @param mode is an enum representing the high-level state of the parser
250   */
251  public void resetMode(HtmlParser.Mode mode);
252
253  /**
254   * A specialized directive to tell the parser there is some content
255   * that will be inserted here but that it will not get to parse. Used
256   * by the template system that may not be able to give some content
257   * to the parser but wants it to know there typically will be content
258   * inserted at that point. This is a hint used in corner cases within
259   * parsing of HTML attribute names and values where content we do not
260   * get to see could affect our parsing and alter our current state.
261   *
262   * <p>Returns {@code false} if and only if the parser encountered
263   * a fatal error which prevents it from continuing further parsing.
264   *
265   * <p>Note: The return value is different from the C++ Parser which
266   * always returns {@code true} but in my opinion makes more sense.
267   *
268   * @throws ParseException if an unrecoverable error occurred during parsing
269   */
270  public void insertText() throws ParseException;
271
272  /**
273   * Returns the state the Javascript parser is in.
274   *
275   * <p>See {@link JavascriptParser} for more information on the valid
276   * external states. The caller will typically first determine that the
277   * parser is processing Javascript and then invoke this method to
278   * obtain more fine-grained state information.
279   *
280   * @return external state of the javascript parser
281   */
282  public ExternalState getJavascriptState();
283}
284