156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/*
256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Copyright (C) 2010 Google Inc.
356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Licensed under the Apache License, Version 2.0 (the "License");
556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * you may not use this file except in compliance with the License.
656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * You may obtain a copy of the License at
756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * http://www.apache.org/licenses/LICENSE-2.0
956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
1056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Unless required by applicable law or agreed to in writing, software
1156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * distributed under the License is distributed on an "AS IS" BASIS,
1256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * See the License for the specific language governing permissions and
1456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * limitations under the License.
1556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */
1656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
1756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpackage com.google.streamhtmlparser.util;
1856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
1956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.common.collect.ImmutableSortedSet;
2056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
2156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport java.util.Set;
2256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport java.util.regex.Pattern;
2356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport java.util.regex.Matcher;
2456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
2556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/**
2656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Utility functions for HTML and Javascript that are most likely
2756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * not interesting to users outside this package.
2856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
2956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>The <code>HtmlParser</code> will be open-sourced hence we took the
3056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * decision to keep these utilities in this package as well as not to
3156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * leverage others that may exist in the <code>google3</code> code base.
3256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
3356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>The functionality exposed is designed to be 100% compatible with
3456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * the corresponding logic in the C-version of the HtmlParser as such
3556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * we are particularly concerned with cross-language compatibility.
3656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
3756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>Note: The words {@code Javascript} and {@code ECMAScript} are used
3856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * interchangeably unless otherwise noted.
3956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */
4056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpublic final class HtmlUtils {
4156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
4256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
4356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * static utility class
4456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
4556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private HtmlUtils() {
4656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }  // COV_NF_LINE
4756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
/**
 * Describes what the {@code content} HTML attribute of a {@code meta} HTML
 * tag holds with respect to a URL. Used by
 * {@link HtmlUtils#parseContentAttributeForUrl(String)}.
 */
public enum META_REDIRECT_TYPE {
  /** The value does not contain a URL in the expected format. */
  NONE,

  /** The value contains a URL, but none of the URL's contents were seen. */
  URL_START,

  /** The value contains a URL and at least some of its contents were seen. */
  URL
}
6656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
6756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
6856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * A regular expression matching the format of a {@code content} attribute
6956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * that contains a URL. Used by {@link #parseContentAttributeForUrl}.
7056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
7156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final String META_REDIRECT_REGEX =
7256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      "^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?";
7356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
7456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  // Safe for use by concurrent threads so we compile once.
7556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final Pattern META_REDIRECT_PATTERN =
7656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE);
7756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
7856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
7956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Set of keywords that can precede a regular expression literal. Taken from:
8056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html">
8156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Language Syntax</a>
8256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
8356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>The token {@code void} was added to the list. Several keywords are
8456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic
8556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * simple we do not differentiate on the version and bundle them all together.
8656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
8756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final Set<String> REGEXP_TOKEN_PREFIXS =
8856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      ImmutableSortedSet.of(
8956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "abstract",
9056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "break",
9156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "case",
9256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "catch",
9356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "class",
9456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "const",
9556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "continue",
9656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "debugger",
9756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "default",
9856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "delete",
9956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "do",
10056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "else",
10156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "enum",
10256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "eval",
10356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "export",
10456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "extends",
10556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "field",
10656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "final",
10756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "finally",
10856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "for",
10956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "function",
11056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "goto",
11156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "if",
11256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "implements",
11356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "import",
11456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "in",
11556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "instanceof",
11656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "native",
11756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "new",
11856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "package",
11956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "private",
12056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "protected",
12156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "public",
12256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "return",
12356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "static",
12456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "switch",
12556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "synchronized",
12656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "throw",
12756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "throws",
12856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "transient",
12956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "try",
13056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "typeof",
13156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "var",
13256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "void",
13356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "volatile",
13456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "while",
13556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "with");
13656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
13756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
13856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Set of all HTML attributes which expect a URI (as the value).
13956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <a href="http://www.w3.org/TR/html4/index/attributes.html">Index of Attributes</a>
14056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
14156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final Set<String> ATTRIBUTE_EXPECTS_URI =
14256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      ImmutableSortedSet.of(
14356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "action",
14456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "archive",
14556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "background",
14656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "cite",
14756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "classid",
14856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "codebase",
14956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "data",
15056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "dynsrc",
15156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "href",
15256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "longdesc",
15356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "src",
15456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          "usemap");
15556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
15656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
15756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Set of {@code Character}s considered whitespace in Javascript.
15856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * See {@link #isJavascriptWhitespace(char)}
15956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
16056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final Set<Character> JAVASCRIPT_WHITESPACE =
16156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      ImmutableSortedSet.of(
16256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            '\u0009',         /* Tab \t */
16356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            '\n',             /* Line-Feed 0x0A */
16456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            '\u000B',         /* Vertical Tab 0x0B */
16556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            '\u000C',         /* Form Feed \f */
16656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            '\r',             /* Carriage Return 0x0D */
16756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            ' ',              /* Space 0x20 */
16856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            '\u00A0',         /* Non-breaking space 0xA0 */
16956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            '\u2028',         /* Line separator */
17056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            '\u2029');        /* Paragraph separator */
17156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
17256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
17356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  * Set of {@code Character}s considered whitespace in HTML.
17456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  * See {@link #isHtmlSpace(char)}
17556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  */
17656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final Set<Character> HTML_WHITESPACE =
17756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      ImmutableSortedSet.of(
17856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          ' ',
17956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          '\t',
18056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          '\n',
18156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          '\r',
18256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          '\u200B');
18356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
18456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
18556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
18656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Determines if the HTML attribute specified expects javascript
18756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * for its value. Such is the case for example with the {@code onclick}
18856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * attribute.
18956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
19056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>Currently returns {@code true} for any attribute name that starts
19156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * with "on" which is not exactly correct but we trust a developer to
19256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * not use non-spec compliant attribute names (e.g. onbogus).
19356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
19456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param attribute the name of an HTML attribute
19556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code false} if the input is null or is not an attribute
19656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *         that expects javascript code; {@code true}
19756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
19856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static boolean isAttributeJavascript(String attribute) {
19956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return ((attribute != null) && attribute.startsWith("on"));
20056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
20156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
20256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
20356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Determines if the HTML attribute specified expects a {@code style}
20456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * for its value. Currently this is only true for the {@code style}
20556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * HTML attribute.
20656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
20756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param attribute the name of an HTML attribute
20856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} iff the attribute name is one that expects a
20956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     style for a value; otherwise {@code false}
21056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
21156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static boolean isAttributeStyle(String attribute) {
21256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return "style".equals(attribute);
21356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
21456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
21556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
21656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Determines if the HTML attribute specified expects a {@code URI}
21756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * for its value. For example, both {@code href} and {@code src}
21856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * expect a {@code URI} but {@code style} does not. Returns
21956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code false} if the attribute given was {@code null}.
22056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
22156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param attribute the name of an HTML attribute
22256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if the attribute name is one that expects
22356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *         a URI for a value; otherwise {@code null}
22456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
22556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @see #ATTRIBUTE_EXPECTS_URI
22656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
22756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static boolean isAttributeUri(String attribute) {
22856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return ATTRIBUTE_EXPECTS_URI.contains(attribute);
22956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
23056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
23156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
23256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Determines if the specified character is an HTML whitespace character.
23356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * A character is an HTML whitespace character if and only if it is one
23456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * of the characters below.
23556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
23656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>A <code>Space</code> character
23756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>A <code>Tab</code> character
23856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>A <code>Line feed</code> character
23956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>A <code>Carriage Return</code> character
24056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>A <code>Zero-Width Space</code> character
24156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
24256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
24356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Note: The list includes the zero-width space (<code>&amp;#x200B;</code>)
24456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * which is not included in the C version.
24556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
24656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param chr the {@code char} to check
24756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if the character is an HTML whitespace character
24856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
24956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <a href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">White space</a>
25056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
25156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static boolean isHtmlSpace(char chr) {
25256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return HTML_WHITESPACE.contains(chr);
25356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
25456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
25556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
25656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Determines if the specified character is an ECMAScript whitespace or line
25756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * terminator character. A character is a whitespace or line terminator if
25856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * and only if it is one of the characters below:
25956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
26056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>A white-space character (<code>Tab</code>, <code>Vertical Tab</code>,
26156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     <code>Form Feed</code>, <code>Space</code>,
26256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     <code>No-break space</code>)
26356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>A line terminator character (<code>Line Feed</code>,
26456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     <code>Carriage Return</code>, <code>Line separator</code>,
26556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     <code>Paragraph Separator</code>).
26656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
26756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
26856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in
26956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * particular, this list is quite different from that in
27056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <code>Character.isWhitespace</code>.
27156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <a href="http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf">
27256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * ECMAScript Language Specification</a>
27356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
27456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param chr the {@code char} to check
27556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} or {@code false}
27656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
27756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
27856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static boolean isJavascriptWhitespace(char chr) {
27956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return JAVASCRIPT_WHITESPACE.contains(chr);
28056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
28156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
28256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
28356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Determines if the specified character is a valid character in an
28456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * ECMAScript identifier. This determination is currently not exact,
28556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * in particular:
28656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
28756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>It does not accept Unicode letters, only ASCII ones.
28856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>It does not distinguish between the first character of an identifier
28956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     (which cannot contain numbers) and subsequent characters.
29056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </li>
29156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
29256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
29356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * We are considering leveraging <code>Character.isJavaIdentifierStart</code>
29456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * and <code>Character.isJavaIdentifierPart</code> given that Java
29556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * and Javascript follow similar identifier naming rules but we lose
29656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * compatibility with the C-version.
29756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
29856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param chr {@code char} to check
29956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} if the {@code chr} is a Javascript whitespace
30056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *         character; otherwise {@code false}
30156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
30256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static boolean isJavascriptIdentifier(char chr) {
30356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return ((chr >= 'a' && chr <= 'z')
30456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (chr >= 'A' && chr <= 'Z')
30556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || (chr >= '0' && chr <= '9')
30656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        || chr == '_' || chr == '$');
30756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
30856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
30956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
31056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Determines if the input token provided is a valid token prefix to a
31156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * javascript regular expression.  The token argument is compared against
31256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * a {@code Set} of identifiers that can precede a regular expression in the
31356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * javascript grammar, and returns {@code true} if the provided
31456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code String} is in that {@code Set}.
31556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
31656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param input the {@code String} token to check
31756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@code true} iff the token is a valid prefix of a regexp
31856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
31956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static boolean isJavascriptRegexpPrefix(String input) {
32056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return REGEXP_TOKEN_PREFIXS.contains(input);
32156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
32256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
32356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
32456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Encodes the specified character using Ascii for convenient insertion into
32556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * a single-quote enclosed {@code String}. Printable characters
32656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * are returned as-is. Carriage Return, Line Feed, Horizontal Tab,
32756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * back-slash and single quote are all backslash-escaped. All other characters
32856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * are returned hex-encoded.
32956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
33056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param chr {@code char} to encode
33156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return an Ascii-friendly encoding of the given {@code char}
33256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
33356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static String encodeCharForAscii(char chr) {
33456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (chr == '\'') {
33556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return "\\'";
33656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (chr == '\\') {
33756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return "\\\\";
33856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (chr >= 32 && chr <= 126) {
33956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return String.format("%c", chr);
34056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (chr == '\n') {
34156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return "\\n";
34256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (chr == '\r') {
34356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return "\\r";
34456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else if (chr == '\t') {
34556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return "\\t";
34656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } else {
34756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      // Cannot apply a precision specifier for integral types. Specifying
34856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      // 0-padded hex-encoding with minimum width of two.
34956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return String.format("\\u%04x", (int)chr);
35056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
35156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
35256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
35356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
35456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Parses the given {@code String} to determine if it contains a URL in the
35556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * format followed by the {@code content} attribute of the {@code meta}
35656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * HTML tag.
35756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
35856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>This function expects to receive the value of the {@code content} HTML
35956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * attribute. This attribute takes on different meanings depending on the
36056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * value of the {@code http-equiv} HTML attribute of the same {@code meta}
36156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * tag. Since we may not have access to the {@code http-equiv} attribute,
36256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * we instead rely on parsing the given value to determine if it contains
36356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * a URL.
36456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
36556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * The specification of the {@code meta} HTML tag can be found in:
36656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *   http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
36756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
36856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the
36956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * value contains a URL and whether we are at the start of the URL or past
37056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * the start. We are at the start of the URL if and only if one of the two
37156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * conditions below is true:
37256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
37356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>The given input does not contain any characters from the URL proper.
37456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Example "5; URL=".
37556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>The given input only contains the optional leading single or double
37656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * quote leading the URL. Example "5; URL='".
37756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </li>
37856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
37956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
38056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>Examples:
38156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
38256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li> Example of a complete {@code meta} tag where the {@code content}
38356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * attribute contains a URL [we are not at the start of the URL]:
38456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <pre>
38556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * &lt;meta http-equiv="refresh" content="5; URL=http://www.google.com"&gt;
38656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </pre>
38756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li> Example of a complete {@code meta} tag where the {@code content}
38856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * attribute contains a URL [we are at the start of the URL]:
38956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <pre>
39056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * &lt;meta http-equiv="refresh" content="5; URL="&gt;
39156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </pre>
39256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>Example of a complete {@code meta} tag where the {@code content}
39356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * attribute does not contain a URL:
39456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <pre>
39556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * &lt;meta http-equiv="content-type" content="text/html"&gt;
39656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </pre>
39756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
39856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
39956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param value {@code String} to parse
40056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence
40156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * of a URL in the given value
40256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
40356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) {
40456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (value == null)
40556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return META_REDIRECT_TYPE.NONE;
40656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
40756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    Matcher matcher = META_REDIRECT_PATTERN.matcher(value);
40856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (!matcher.find())
40956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return META_REDIRECT_TYPE.NONE;
41056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
41156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // We have more content.
41256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (value.length() > matcher.end())
41356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return META_REDIRECT_TYPE.URL;
41456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
41556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return META_REDIRECT_TYPE.URL_START;
41656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
41756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson}
418