1/* 2 * Copyright (C) 2010 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.google.streamhtmlparser.util; 18 19import com.google.common.collect.ImmutableSortedSet; 20 21import java.util.Set; 22import java.util.regex.Pattern; 23import java.util.regex.Matcher; 24 25/** 26 * Utility functions for HTML and Javascript that are most likely 27 * not interesting to users outside this package. 28 * 29 * <p>The <code>HtmlParser</code> will be open-sourced hence we took the 30 * decision to keep these utilities in this package as well as not to 31 * leverage others that may exist in the <code>google3</code> code base. 32 * 33 * <p>The functionality exposed is designed to be 100% compatible with 34 * the corresponding logic in the C-version of the HtmlParser as such 35 * we are particularly concerned with cross-language compatibility. 36 * 37 * <p>Note: The words {@code Javascript} and {@code ECMAScript} are used 38 * interchangeably unless otherwise noted. 39 */ 40public final class HtmlUtils { 41 42 /** 43 * static utility class 44 */ 45 private HtmlUtils() { 46 } // COV_NF_LINE 47 48 /** 49 * Indicates the type of content contained in the {@code content} HTML 50 * attribute of the {@code meta} HTML tag. Used by 51 * {@link HtmlUtils#parseContentAttributeForUrl(String)}. 52 * <p>The values are: 53 * <ul> 54 * <li>{@code NONE} if it does not contain a URL in the expected format. 55 * <li>{@code URL_START} if it contains a URL but hasn't seen any of 56 * its contents. 57 * <li>{@code URL} if it contains a URL and has seen at least some of 58 * its contents. 59 * </ul> 60 */ 61 public enum META_REDIRECT_TYPE { 62 NONE, 63 URL_START, 64 URL 65 } 66 67 /** 68 * A regular expression matching the format of a {@code content} attribute 69 * that contains a URL. Used by {@link #parseContentAttributeForUrl}. 70 */ 71 private static final String META_REDIRECT_REGEX = 72 "^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?"; 73 74 // Safe for use by concurrent threads so we compile once. 75 private static final Pattern META_REDIRECT_PATTERN = 76 Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE); 77 78 /** 79 * Set of keywords that can precede a regular expression literal. Taken from: 80 * <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html"> 81 * Language Syntax</a> 82 * 83 * <p>The token {@code void} was added to the list. Several keywords are 84 * defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic 85 * simple we do not differentiate on the version and bundle them all together. 86 */ 87 private static final Set<String> REGEXP_TOKEN_PREFIXS = 88 ImmutableSortedSet.of( 89 "abstract", 90 "break", 91 "case", 92 "catch", 93 "class", 94 "const", 95 "continue", 96 "debugger", 97 "default", 98 "delete", 99 "do", 100 "else", 101 "enum", 102 "eval", 103 "export", 104 "extends", 105 "field", 106 "final", 107 "finally", 108 "for", 109 "function", 110 "goto", 111 "if", 112 "implements", 113 "import", 114 "in", 115 "instanceof", 116 "native", 117 "new", 118 "package", 119 "private", 120 "protected", 121 "public", 122 "return", 123 "static", 124 "switch", 125 "synchronized", 126 "throw", 127 "throws", 128 "transient", 129 "try", 130 "typeof", 131 "var", 132 "void", 133 "volatile", 134 "while", 135 "with"); 136 137 /** 138 * Set of all HTML attributes which expect a URI (as the value). 139 * <a href="http://www.w3.org/TR/html4/index/attributes.html">Index of Attributes</a> 140 */ 141 private static final Set<String> ATTRIBUTE_EXPECTS_URI = 142 ImmutableSortedSet.of( 143 "action", 144 "archive", 145 "background", 146 "cite", 147 "classid", 148 "codebase", 149 "data", 150 "dynsrc", 151 "href", 152 "longdesc", 153 "src", 154 "usemap"); 155 156 /** 157 * Set of {@code Character}s considered whitespace in Javascript. 158 * See {@link #isJavascriptWhitespace(char)} 159 */ 160 private static final Set<Character> JAVASCRIPT_WHITESPACE = 161 ImmutableSortedSet.of( 162 '\u0009', /* Tab \t */ 163 '\n', /* Line-Feed 0x0A */ 164 '\u000B', /* Vertical Tab 0x0B */ 165 '\u000C', /* Form Feed \f */ 166 '\r', /* Carriage Return 0x0D */ 167 ' ', /* Space 0x20 */ 168 '\u00A0', /* Non-breaking space 0xA0 */ 169 '\u2028', /* Line separator */ 170 '\u2029'); /* Paragraph separator */ 171 172 /** 173 * Set of {@code Character}s considered whitespace in HTML. 174 * See {@link #isHtmlSpace(char)} 175 */ 176 private static final Set<Character> HTML_WHITESPACE = 177 ImmutableSortedSet.of( 178 ' ', 179 '\t', 180 '\n', 181 '\r', 182 '\u200B'); 183 184 185 /** 186 * Determines if the HTML attribute specified expects javascript 187 * for its value. Such is the case for example with the {@code onclick} 188 * attribute. 189 * 190 * <p>Currently returns {@code true} for any attribute name that starts 191 * with "on" which is not exactly correct but we trust a developer to 192 * not use non-spec compliant attribute names (e.g. onbogus). 193 * 194 * @param attribute the name of an HTML attribute 195 * @return {@code false} if the input is null or is not an attribute 196 * that expects javascript code; {@code true} 197 */ 198 public static boolean isAttributeJavascript(String attribute) { 199 return ((attribute != null) && attribute.startsWith("on")); 200 } 201 202 /** 203 * Determines if the HTML attribute specified expects a {@code style} 204 * for its value. Currently this is only true for the {@code style} 205 * HTML attribute. 206 * 207 * @param attribute the name of an HTML attribute 208 * @return {@code true} iff the attribute name is one that expects a 209 * style for a value; otherwise {@code false} 210 */ 211 public static boolean isAttributeStyle(String attribute) { 212 return "style".equals(attribute); 213 } 214 215 /** 216 * Determines if the HTML attribute specified expects a {@code URI} 217 * for its value. For example, both {@code href} and {@code src} 218 * expect a {@code URI} but {@code style} does not. Returns 219 * {@code false} if the attribute given was {@code null}. 220 * 221 * @param attribute the name of an HTML attribute 222 * @return {@code true} if the attribute name is one that expects 223 * a URI for a value; otherwise {@code null} 224 * 225 * @see #ATTRIBUTE_EXPECTS_URI 226 */ 227 public static boolean isAttributeUri(String attribute) { 228 return ATTRIBUTE_EXPECTS_URI.contains(attribute); 229 } 230 231 /** 232 * Determines if the specified character is an HTML whitespace character. 233 * A character is an HTML whitespace character if and only if it is one 234 * of the characters below. 235 * <ul> 236 * <li>A <code>Space</code> character 237 * <li>A <code>Tab</code> character 238 * <li>A <code>Line feed</code> character 239 * <li>A <code>Carriage Return</code> character 240 * <li>A <code>Zero-Width Space</code> character 241 * </ul> 242 * 243 * Note: The list includes the zero-width space (<code>&#x200B;</code>) 244 * which is not included in the C version. 245 * 246 * @param chr the {@code char} to check 247 * @return {@code true} if the character is an HTML whitespace character 248 * 249 * <a href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">White space</a> 250 */ 251 public static boolean isHtmlSpace(char chr) { 252 return HTML_WHITESPACE.contains(chr); 253 } 254 255 /** 256 * Determines if the specified character is an ECMAScript whitespace or line 257 * terminator character. A character is a whitespace or line terminator if 258 * and only if it is one of the characters below: 259 * <ul> 260 * <li>A white-space character (<code>Tab</code>, <code>Vertical Tab</code>, 261 * <code>Form Feed</code>, <code>Space</code>, 262 * <code>No-break space</code>) 263 * <li>A line terminator character (<code>Line Feed</code>, 264 * <code>Carriage Return</code>, <code>Line separator</code>, 265 * <code>Paragraph Separator</code>). 266 * </ul> 267 * 268 * <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in 269 * particular, this list is quite different from that in 270 * <code>Character.isWhitespace</code>. 271 * <a href="http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf"> 272 * ECMAScript Language Specification</a> 273 * 274 * @param chr the {@code char} to check 275 * @return {@code true} or {@code false} 276 * 277 */ 278 public static boolean isJavascriptWhitespace(char chr) { 279 return JAVASCRIPT_WHITESPACE.contains(chr); 280 } 281 282 /** 283 * Determines if the specified character is a valid character in an 284 * ECMAScript identifier. This determination is currently not exact, 285 * in particular: 286 * <ul> 287 * <li>It does not accept Unicode letters, only ASCII ones. 288 * <li>It does not distinguish between the first character of an identifier 289 * (which cannot contain numbers) and subsequent characters. 290 * </li> 291 * </ul> 292 * 293 * We are considering leveraging <code>Character.isJavaIdentifierStart</code> 294 * and <code>Character.isJavaIdentifierPart</code> given that Java 295 * and Javascript follow similar identifier naming rules but we lose 296 * compatibility with the C-version. 297 * 298 * @param chr {@code char} to check 299 * @return {@code true} if the {@code chr} is a Javascript whitespace 300 * character; otherwise {@code false} 301 */ 302 public static boolean isJavascriptIdentifier(char chr) { 303 return ((chr >= 'a' && chr <= 'z') 304 || (chr >= 'A' && chr <= 'Z') 305 || (chr >= '0' && chr <= '9') 306 || chr == '_' || chr == '$'); 307 } 308 309 /** 310 * Determines if the input token provided is a valid token prefix to a 311 * javascript regular expression. The token argument is compared against 312 * a {@code Set} of identifiers that can precede a regular expression in the 313 * javascript grammar, and returns {@code true} if the provided 314 * {@code String} is in that {@code Set}. 315 * 316 * @param input the {@code String} token to check 317 * @return {@code true} iff the token is a valid prefix of a regexp 318 */ 319 public static boolean isJavascriptRegexpPrefix(String input) { 320 return REGEXP_TOKEN_PREFIXS.contains(input); 321 } 322 323 /** 324 * Encodes the specified character using Ascii for convenient insertion into 325 * a single-quote enclosed {@code String}. Printable characters 326 * are returned as-is. Carriage Return, Line Feed, Horizontal Tab, 327 * back-slash and single quote are all backslash-escaped. All other characters 328 * are returned hex-encoded. 329 * 330 * @param chr {@code char} to encode 331 * @return an Ascii-friendly encoding of the given {@code char} 332 */ 333 public static String encodeCharForAscii(char chr) { 334 if (chr == '\'') { 335 return "\\'"; 336 } else if (chr == '\\') { 337 return "\\\\"; 338 } else if (chr >= 32 && chr <= 126) { 339 return String.format("%c", chr); 340 } else if (chr == '\n') { 341 return "\\n"; 342 } else if (chr == '\r') { 343 return "\\r"; 344 } else if (chr == '\t') { 345 return "\\t"; 346 } else { 347 // Cannot apply a precision specifier for integral types. Specifying 348 // 0-padded hex-encoding with minimum width of two. 349 return String.format("\\u%04x", (int)chr); 350 } 351 } 352 353 /** 354 * Parses the given {@code String} to determine if it contains a URL in the 355 * format followed by the {@code content} attribute of the {@code meta} 356 * HTML tag. 357 * 358 * <p>This function expects to receive the value of the {@code content} HTML 359 * attribute. This attribute takes on different meanings depending on the 360 * value of the {@code http-equiv} HTML attribute of the same {@code meta} 361 * tag. Since we may not have access to the {@code http-equiv} attribute, 362 * we instead rely on parsing the given value to determine if it contains 363 * a URL. 364 * 365 * The specification of the {@code meta} HTML tag can be found in: 366 * http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh 367 * 368 * <p>We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the 369 * value contains a URL and whether we are at the start of the URL or past 370 * the start. We are at the start of the URL if and only if one of the two 371 * conditions below is true: 372 * <ul> 373 * <li>The given input does not contain any characters from the URL proper. 374 * Example "5; URL=". 375 * <li>The given input only contains the optional leading single or double 376 * quote leading the URL. Example "5; URL='". 377 * </li> 378 * </ul> 379 * 380 * <p>Examples: 381 * <ul> 382 * <li> Example of a complete {@code meta} tag where the {@code content} 383 * attribute contains a URL [we are not at the start of the URL]: 384 * <pre> 385 * <meta http-equiv="refresh" content="5; URL=http://www.google.com"> 386 * </pre> 387 * <li> Example of a complete {@code meta} tag where the {@code content} 388 * attribute contains a URL [we are at the start of the URL]: 389 * <pre> 390 * <meta http-equiv="refresh" content="5; URL="> 391 * </pre> 392 * <li>Example of a complete {@code meta} tag where the {@code content} 393 * attribute does not contain a URL: 394 * <pre> 395 * <meta http-equiv="content-type" content="text/html"> 396 * </pre> 397 * </ul> 398 * 399 * @param value {@code String} to parse 400 * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence 401 * of a URL in the given value 402 */ 403 public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) { 404 if (value == null) 405 return META_REDIRECT_TYPE.NONE; 406 407 Matcher matcher = META_REDIRECT_PATTERN.matcher(value); 408 if (!matcher.find()) 409 return META_REDIRECT_TYPE.NONE; 410 411 // We have more content. 412 if (value.length() > matcher.end()) 413 return META_REDIRECT_TYPE.URL; 414 415 return META_REDIRECT_TYPE.URL_START; 416 } 417} 418