1/*
2 * Copyright (C) 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.streamhtmlparser.util;
18
import com.google.common.collect.ImmutableSortedSet;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
24
/**
 * Utility functions for HTML and Javascript that are most likely
 * not interesting to users outside this package.
 *
 * <p>Because the <code>HtmlParser</code> will be open-sourced, we decided
 * to keep these utilities in this package rather than leverage others
 * that may exist in the <code>google3</code> code base.
 *
 * <p>The functionality exposed is designed to be 100% compatible with
 * the corresponding logic in the C version of the HtmlParser; as such,
 * we are particularly concerned with cross-language compatibility.
 *
 * <p>Note: The words {@code Javascript} and {@code ECMAScript} are used
 * interchangeably unless otherwise noted.
 */
public final class HtmlUtils {

  /**
   * Not instantiable: this class only hosts static helpers.
   */
  private HtmlUtils() {
  }  // COV_NF_LINE

  /**
   * Indicates the type of content contained in the {@code content} HTML
   * attribute of the {@code meta} HTML tag. Used by
   * {@link HtmlUtils#parseContentAttributeForUrl(String)}.
   * <p>The values are:
   * <ul>
   * <li>{@code NONE} if it does not contain a URL in the expected format.
   * <li>{@code URL_START} if it contains a URL but hasn't seen any of
   * its contents.
   * <li>{@code URL} if it contains a URL and has seen at least some of
   * its contents.
   * </ul>
   */
  public enum META_REDIRECT_TYPE {
    NONE,
    URL_START,
    URL
  }

  /**
   * A regular expression matching the format of a {@code content} attribute
   * that contains a URL. Used by {@link #parseContentAttributeForUrl}.
   *
   * <p>It matches, anchored at the start of the input: optional whitespace,
   * optional digits, optional whitespace, a semicolon, the word {@code URL},
   * an equals sign (whitespace allowed around each) and finally one optional
   * single or double quote.
   */
  private static final String META_REDIRECT_REGEX =
      "^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?";

  // Safe for use by concurrent threads so we compile once.
  private static final Pattern META_REDIRECT_PATTERN =
      Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE);

  /**
   * Set of keywords that can precede a regular expression literal. Taken from:
   * <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html">
   * Language Syntax</a>
   *
   * <p>The token {@code void} was added to the list. Several keywords are
   * defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic
   * simple we do not differentiate on the version and bundle them all together.
   *
   * <p>Only membership is ever queried, so no ordering is maintained.
   */
  private static final Set<String> REGEXP_TOKEN_PREFIXES =
      immutableSetOf(
          "abstract",
          "break",
          "case",
          "catch",
          "class",
          "const",
          "continue",
          "debugger",
          "default",
          "delete",
          "do",
          "else",
          "enum",
          "eval",
          "export",
          "extends",
          "field",
          "final",
          "finally",
          "for",
          "function",
          "goto",
          "if",
          "implements",
          "import",
          "in",
          "instanceof",
          "native",
          "new",
          "package",
          "private",
          "protected",
          "public",
          "return",
          "static",
          "switch",
          "synchronized",
          "throw",
          "throws",
          "transient",
          "try",
          "typeof",
          "var",
          "void",
          "volatile",
          "while",
          "with");

  /**
   * Set of all HTML attributes which expect a URI (as the value).
   * <a href="http://www.w3.org/TR/html4/index/attributes.html">Index of Attributes</a>
   *
   * <p>Only membership is ever queried, so no ordering is maintained.
   */
  private static final Set<String> ATTRIBUTE_EXPECTS_URI =
      immutableSetOf(
          "action",
          "archive",
          "background",
          "cite",
          "classid",
          "codebase",
          "data",
          "dynsrc",
          "href",
          "longdesc",
          "src",
          "usemap");

  /**
   * Builds an unmodifiable set holding exactly the given strings.
   *
   * @param elements the members of the set
   * @return an unmodifiable {@code Set} containing {@code elements}
   */
  private static Set<String> immutableSetOf(String... elements) {
    return Collections.unmodifiableSet(
        new HashSet<String>(Arrays.asList(elements)));
  }

  /**
   * Determines if the HTML attribute specified expects javascript
   * for its value. Such is the case for example with the {@code onclick}
   * attribute.
   *
   * <p>Currently returns {@code true} for any attribute name that starts
   * with "on" which is not exactly correct but we trust a developer to
   * not use non-spec compliant attribute names (e.g. onbogus). The prefix
   * comparison is case-sensitive, so {@code "onclick"} matches but
   * {@code "ONCLICK"} does not.
   *
   * @param attribute the name of an HTML attribute, may be {@code null}
   * @return {@code true} if the input is non-null and starts with "on";
   *         {@code false} if the input is {@code null} or does not start
   *         with "on"
   */
  public static boolean isAttributeJavascript(String attribute) {
    return attribute != null && attribute.startsWith("on");
  }

  /**
   * Determines if the HTML attribute specified expects a {@code style}
   * for its value. Currently this is only true for the {@code style}
   * HTML attribute.
   *
   * @param attribute the name of an HTML attribute, may be {@code null}
   * @return {@code true} iff the attribute name is one that expects a
   *     style for a value; otherwise {@code false}
   */
  public static boolean isAttributeStyle(String attribute) {
    return "style".equals(attribute);
  }

  /**
   * Determines if the HTML attribute specified expects a {@code URI}
   * for its value. For example, both {@code href} and {@code src}
   * expect a {@code URI} but {@code style} does not. Returns
   * {@code false} if the attribute given was {@code null}.
   *
   * @param attribute the name of an HTML attribute, may be {@code null}
   * @return {@code true} if the attribute name is one that expects
   *         a URI for a value; otherwise {@code false}
   *
   * @see #ATTRIBUTE_EXPECTS_URI
   */
  public static boolean isAttributeUri(String attribute) {
    return attribute != null && ATTRIBUTE_EXPECTS_URI.contains(attribute);
  }

  /**
   * Determines if the specified character is an HTML whitespace character.
   * A character is an HTML whitespace character if and only if it is one
   * of the characters below.
   * <ul>
   * <li>A <code>Space</code> character
   * <li>A <code>Tab</code> character
   * <li>A <code>Line feed</code> character
   * <li>A <code>Carriage Return</code> character
   * <li>A <code>Zero-Width Space</code> character
   * </ul>
   *
   * Note: The list includes the zero-width space (<code>&amp;#x200B;</code>)
   * which is not included in the C version.
   *
   * @param chr the {@code char} to check
   * @return {@code true} if the character is an HTML whitespace character
   *
   * <a href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">White space</a>
   */
  public static boolean isHtmlSpace(char chr) {
    // A switch on the primitive avoids boxing the argument on every call.
    switch (chr) {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
      case '\u200B':        /* Zero-width space */
        return true;
      default:
        return false;
    }
  }

  /**
   * Determines if the specified character is an ECMAScript whitespace or line
   * terminator character. A character is a whitespace or line terminator if
   * and only if it is one of the characters below:
   * <ul>
   * <li>A white-space character (<code>Tab</code>, <code>Vertical Tab</code>,
   *     <code>Form Feed</code>, <code>Space</code>,
   *     <code>No-break space</code>)
   * <li>A line terminator character (<code>Line Feed</code>,
   *     <code>Carriage Return</code>, <code>Line separator</code>,
   *     <code>Paragraph Separator</code>).
   * </ul>
   *
   * <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in
   * particular, this list is quite different from that in
   * <code>Character.isWhitespace</code>.
   * <a href="http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf">
   * ECMAScript Language Specification</a>
   *
   * @param chr the {@code char} to check
   * @return {@code true} if the character is one of those listed above;
   *         otherwise {@code false}
   */
  public static boolean isJavascriptWhitespace(char chr) {
    // A switch on the primitive avoids boxing the argument on every call.
    switch (chr) {
      case '\t':            /* Tab 0x09 */
      case '\n':            /* Line-Feed 0x0A */
      case '\u000B':        /* Vertical Tab 0x0B */
      case '\f':            /* Form Feed 0x0C */
      case '\r':            /* Carriage Return 0x0D */
      case ' ':             /* Space 0x20 */
      case '\u00A0':        /* Non-breaking space 0xA0 */
      case '\u2028':        /* Line separator */
      case '\u2029':        /* Paragraph separator */
        return true;
      default:
        return false;
    }
  }

  /**
   * Determines if the specified character is a valid character in an
   * ECMAScript identifier. This determination is currently not exact,
   * in particular:
   * <ul>
   * <li>It does not accept Unicode letters, only ASCII ones.
   * <li>It does not distinguish between the first character of an identifier
   *     (which cannot contain numbers) and subsequent characters.
   * </li>
   * </ul>
   *
   * We are considering leveraging <code>Character.isJavaIdentifierStart</code>
   * and <code>Character.isJavaIdentifierPart</code> given that Java
   * and Javascript follow similar identifier naming rules but we lose
   * compatibility with the C-version.
   *
   * @param chr {@code char} to check
   * @return {@code true} if the {@code chr} is an ASCII letter, an ASCII
   *         digit, an underscore or a dollar sign; otherwise {@code false}
   */
  public static boolean isJavascriptIdentifier(char chr) {
    return (chr >= 'a' && chr <= 'z')
        || (chr >= 'A' && chr <= 'Z')
        || (chr >= '0' && chr <= '9')
        || chr == '_'
        || chr == '$';
  }

  /**
   * Determines if the input token provided is a valid token prefix to a
   * javascript regular expression.  The token argument is compared against
   * a {@code Set} of identifiers that can precede a regular expression in the
   * javascript grammar, and returns {@code true} if the provided
   * {@code String} is in that {@code Set}.
   *
   * @param input the {@code String} token to check, may be {@code null}
   * @return {@code true} iff the token is a valid prefix of a regexp;
   *         {@code false} otherwise, including for {@code null}
   */
  public static boolean isJavascriptRegexpPrefix(String input) {
    return input != null && REGEXP_TOKEN_PREFIXES.contains(input);
  }

  /**
   * Encodes the specified character using Ascii for convenient insertion into
   * a single-quote enclosed {@code String}. Printable characters (codes 32
   * through 126) are returned as-is, except that back-slash and single quote
   * are backslash-escaped. Carriage Return, Line Feed and Horizontal Tab
   * are backslash-escaped as well. All other characters are returned as
   * <code>&#92;u</code> followed by four lower-case, zero-padded hex digits.
   *
   * @param chr {@code char} to encode
   * @return an Ascii-friendly encoding of the given {@code char}
   */
  public static String encodeCharForAscii(char chr) {
    // Characters with a dedicated two-character escape.
    switch (chr) {
      case '\'':
        return "\\'";
      case '\\':
        return "\\\\";
      case '\n':
        return "\\n";
      case '\r':
        return "\\r";
      case '\t':
        return "\\t";
      default:
        break;
    }

    // Remaining printable Ascii is passed through untouched.
    if (chr >= 32 && chr <= 126) {
      return String.valueOf(chr);
    }

    // Cannot apply a precision specifier for integral types. Specifying
    // 0-padded hex-encoding with minimum width of four.
    return String.format("\\u%04x", (int) chr);
  }

  /**
   * Parses the given {@code String} to determine if it contains a URL in the
   * format followed by the {@code content} attribute of the {@code meta}
   * HTML tag.
   *
   * <p>This function expects to receive the value of the {@code content} HTML
   * attribute. This attribute takes on different meanings depending on the
   * value of the {@code http-equiv} HTML attribute of the same {@code meta}
   * tag. Since we may not have access to the {@code http-equiv} attribute,
   * we instead rely on parsing the given value to determine if it contains
   * a URL.
   *
   * The specification of the {@code meta} HTML tag can be found in:
   *   http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
   *
   * <p>We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the
   * value contains a URL and whether we are at the start of the URL or past
   * the start. We are at the start of the URL if and only if one of the two
   * conditions below is true:
   * <ul>
   * <li>The given input does not contain any characters from the URL proper.
   * Example "5; URL=".
   * <li>The given input only contains the optional leading single or double
   * quote leading the URL. Example "5; URL='".
   * </li>
   * </ul>
   *
   * <p>Examples:
   * <ul>
   * <li> Example of a complete {@code meta} tag where the {@code content}
   * attribute contains a URL [we are not at the start of the URL]:
   * <pre>
   * &lt;meta http-equiv="refresh" content="5; URL=http://www.google.com"&gt;
   * </pre>
   * <li> Example of a complete {@code meta} tag where the {@code content}
   * attribute contains a URL [we are at the start of the URL]:
   * <pre>
   * &lt;meta http-equiv="refresh" content="5; URL="&gt;
   * </pre>
   * <li>Example of a complete {@code meta} tag where the {@code content}
   * attribute does not contain a URL:
   * <pre>
   * &lt;meta http-equiv="content-type" content="text/html"&gt;
   * </pre>
   * </ul>
   *
   * @param value {@code String} to parse, may be {@code null}
   * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence
   * of a URL in the given value; {@code NONE} when {@code value} is
   * {@code null}
   */
  public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) {
    if (value == null) {
      return META_REDIRECT_TYPE.NONE;
    }

    // The pattern is anchored at the start, so a hit always begins at index 0.
    Matcher matcher = META_REDIRECT_PATTERN.matcher(value);
    if (!matcher.find()) {
      return META_REDIRECT_TYPE.NONE;
    }

    // Anything after the matched prefix (and its optional quote) is URL
    // content; nothing left over means the URL has only just started.
    if (value.length() > matcher.end()) {
      return META_REDIRECT_TYPE.URL;
    }
    return META_REDIRECT_TYPE.URL_START;
  }
}
418