// HtmlEntities.java revision 8403881c365ab36b721ccc4500af1b3a5bd25870
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
28 29package org.owasp.html; 30 31import com.google.common.collect.ImmutableMap; 32 33class HtmlEntities { 34 35 public static long decodeEntityAt(String html, int offset, int limit) { 36 char ch = html.charAt(offset); 37 if ('&' != ch) { 38 return ((offset + 1L) << 32) | ch; 39 } 40 41 int entityLimit = Math.min(limit, offset + 10); 42 int end = -1; 43 int tail = -1; 44 if (entityLimit == limit) { 45 // Assume a broken entity that ends at the end until shown otherwise. 46 end = tail = entityLimit; 47 } 48 entityloop: 49 for (int i = offset + 1; i < entityLimit; ++i) { 50 switch (html.charAt(i)) { 51 case ';': 52 end = i; 53 tail = end + 1; 54 break entityloop; 55 case '#': 56 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 57 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 58 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 59 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 60 case 'Y': case 'Z': 61 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 62 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 63 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 64 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 65 case 'y': case 'z': 66 case '0': case '1': case '2': case '3': case '4': case '5': 67 case '6': case '7': case '8': case '9': 68 break; 69 case '=': 70 return ((offset + 1L) << 32) | '&'; 71 default: // A possible broken entity. 
72 end = i; 73 tail = i; 74 break entityloop; 75 } 76 } 77 if (end < 0 || offset + 2 >= end) { 78 return ((offset + 1L) << 32) | '&'; 79 } 80 // Now we know where the entity ends, and that there is at least one 81 // character in the entity name 82 char ch1 = html.charAt(offset + 1); 83 char ch2 = html.charAt(offset + 2); 84 int codepoint = -1; 85 if ('#' == ch1) { 86 // numeric entity 87 if ('x' == ch2 || 'X' == ch2) { 88 if (end == offset + 3) { // No digits 89 return ((offset + 1L) << 32) | '&'; 90 } 91 codepoint = 0; 92 // hex literal 93 digloop: 94 for (int i = offset + 3; i < end; ++i) { 95 char digit = html.charAt(i); 96 switch (digit & 0xfff8) { 97 case 0x30: case 0x38: // ASCII 48-57 are '0'-'9' 98 int decDig = digit & 0xf; 99 if (decDig < 10) { 100 codepoint = (codepoint << 4) | decDig; 101 } else { 102 codepoint = -1; 103 break digloop; 104 } 105 break; 106 // ASCII 65-70 and 97-102 are 'A'-'Z' && 'a'-'z' 107 case 0x40: case 0x60: 108 int hexDig = (digit & 0x7); 109 if (hexDig != 0 && hexDig < 7) { 110 codepoint = (codepoint << 4) | (hexDig + 9); 111 } else { 112 codepoint = -1; 113 break digloop; 114 } 115 break; 116 default: 117 codepoint = -1; 118 break digloop; 119 } 120 } 121 } else { 122 codepoint = 0; 123 // decimal literal 124 digloop: 125 for (int i = offset + 2; i < end; ++i) { 126 char digit = html.charAt(i); 127 switch (digit & 0xfff8) { 128 case 0x30: case 0x38: // ASCII 48-57 are '0'-'9' 129 int decDig = digit - '0'; 130 if (decDig < 10) { 131 codepoint = (codepoint * 10) + decDig; 132 } else { 133 codepoint = -1; 134 break digloop; 135 } 136 break; 137 default: 138 codepoint = -1; 139 break digloop; 140 } 141 } 142 } 143 } else { 144 Trie t = ENTITY_TRIE; 145 for (int i = offset + 1; i < end; ++i) { 146 char nameChar = html.charAt(i); 147 t = t.lookup(nameChar); 148 if (t == null) { break; } 149 } 150 if (t == null) { 151 t = ENTITY_TRIE; 152 for (int i = offset + 1; i < end; ++i) { 153 char nameChar = html.charAt(i); 154 if ('Z' >= 
nameChar && nameChar >= 'A') { nameChar |= 32; } 155 t = t.lookup(nameChar); 156 if (t == null) { break; } 157 } 158 } 159 if (t != null && t.isTerminal()) { 160 codepoint = t.getValue(); 161 } 162 } 163 if (codepoint < 0) { 164 return ((offset + 1L) << 32) | '&'; 165 } else { 166 return (((long) tail) << 32) | codepoint; 167 } 168 } 169 170 /** A possible entity name like "amp" or "gt". */ 171 public static boolean isEntityName(String name) { 172 Trie t = ENTITY_TRIE; 173 int n = name.length(); 174 175 // Treat AMP the same amp, but not Amp. 176 boolean isUcase = true; 177 for (int i = 0; i < n; ++i) { 178 char ch = name.charAt(i); 179 if (!('A' <= ch && ch <= 'Z')) { 180 isUcase = false; 181 break; 182 } 183 } 184 185 if (isUcase) { name = Strings.toLowerCase(name); } 186 187 for (int i = 0; i < n; ++i) { 188 t = t.lookup(name.charAt(i)); 189 if (t == null) { return false; } 190 } 191 return t.isTerminal(); 192 } 193 194 public static final Trie ENTITY_TRIE = new Trie( 195 ImmutableMap.<String, Integer>builder() 196 // C0 Controls and Basic Latin 197 .put("quot", Integer.valueOf('"')) 198 .put("amp", Integer.valueOf('&')) 199 .put("lt", Integer.valueOf('<')) 200 .put("gt", Integer.valueOf('>')) 201 202 // XML 1.0 203 .put("apos", Integer.valueOf('\'')) 204 205 // HTML4 entities 206 .put("nbsp", Integer.valueOf('\u00a0')) 207 .put("iexcl", Integer.valueOf('\u00a1')) 208 .put("cent", Integer.valueOf('\u00a2')) 209 .put("pound", Integer.valueOf('\u00a3')) 210 .put("curren", Integer.valueOf('\u00a4')) 211 .put("yen", Integer.valueOf('\u00a5')) 212 .put("brvbar", Integer.valueOf('\u00a6')) 213 .put("sect", Integer.valueOf('\u00a7')) 214 .put("uml", Integer.valueOf('\u00a8')) 215 .put("copy", Integer.valueOf('\u00a9')) 216 .put("ordf", Integer.valueOf('\u00aa')) 217 .put("laquo", Integer.valueOf('\u00ab')) 218 .put("not", Integer.valueOf('\u00ac')) 219 .put("shy", Integer.valueOf('\u00ad')) 220 .put("reg", Integer.valueOf('\u00ae')) 221 .put("macr", 
Integer.valueOf('\u00af')) 222 .put("deg", Integer.valueOf('\u00b0')) 223 .put("plusmn", Integer.valueOf('\u00b1')) 224 .put("sup2", Integer.valueOf('\u00b2')) 225 .put("sup3", Integer.valueOf('\u00b3')) 226 .put("acute", Integer.valueOf('\u00b4')) 227 .put("micro", Integer.valueOf('\u00b5')) 228 .put("para", Integer.valueOf('\u00b6')) 229 .put("middot", Integer.valueOf('\u00b7')) 230 .put("cedil", Integer.valueOf('\u00b8')) 231 .put("sup1", Integer.valueOf('\u00b9')) 232 .put("ordm", Integer.valueOf('\u00ba')) 233 .put("raquo", Integer.valueOf('\u00bb')) 234 .put("frac14", Integer.valueOf('\u00bc')) 235 .put("frac12", Integer.valueOf('\u00bd')) 236 .put("frac34", Integer.valueOf('\u00be')) 237 .put("iquest", Integer.valueOf('\u00bf')) 238 .put("Agrave", Integer.valueOf('\u00c0')) 239 .put("Aacute", Integer.valueOf('\u00c1')) 240 .put("Acirc", Integer.valueOf('\u00c2')) 241 .put("Atilde", Integer.valueOf('\u00c3')) 242 .put("Auml", Integer.valueOf('\u00c4')) 243 .put("Aring", Integer.valueOf('\u00c5')) 244 .put("AElig", Integer.valueOf('\u00c6')) 245 .put("Ccedil", Integer.valueOf('\u00c7')) 246 .put("Egrave", Integer.valueOf('\u00c8')) 247 .put("Eacute", Integer.valueOf('\u00c9')) 248 .put("Ecirc", Integer.valueOf('\u00ca')) 249 .put("Euml", Integer.valueOf('\u00cb')) 250 .put("Igrave", Integer.valueOf('\u00cc')) 251 .put("Iacute", Integer.valueOf('\u00cd')) 252 .put("Icirc", Integer.valueOf('\u00ce')) 253 .put("Iuml", Integer.valueOf('\u00cf')) 254 .put("ETH", Integer.valueOf('\u00d0')) 255 .put("Ntilde", Integer.valueOf('\u00d1')) 256 .put("Ograve", Integer.valueOf('\u00d2')) 257 .put("Oacute", Integer.valueOf('\u00d3')) 258 .put("Ocirc", Integer.valueOf('\u00d4')) 259 .put("Otilde", Integer.valueOf('\u00d5')) 260 .put("Ouml", Integer.valueOf('\u00d6')) 261 .put("times", Integer.valueOf('\u00d7')) 262 .put("Oslash", Integer.valueOf('\u00d8')) 263 .put("Ugrave", Integer.valueOf('\u00d9')) 264 .put("Uacute", Integer.valueOf('\u00da')) 265 .put("Ucirc", 
Integer.valueOf('\u00db')) 266 .put("Uuml", Integer.valueOf('\u00dc')) 267 .put("Yacute", Integer.valueOf('\u00dd')) 268 .put("THORN", Integer.valueOf('\u00de')) 269 .put("szlig", Integer.valueOf('\u00df')) 270 .put("agrave", Integer.valueOf('\u00e0')) 271 .put("aacute", Integer.valueOf('\u00e1')) 272 .put("acirc", Integer.valueOf('\u00e2')) 273 .put("atilde", Integer.valueOf('\u00e3')) 274 .put("auml", Integer.valueOf('\u00e4')) 275 .put("aring", Integer.valueOf('\u00e5')) 276 .put("aelig", Integer.valueOf('\u00e6')) 277 .put("ccedil", Integer.valueOf('\u00e7')) 278 .put("egrave", Integer.valueOf('\u00e8')) 279 .put("eacute", Integer.valueOf('\u00e9')) 280 .put("ecirc", Integer.valueOf('\u00ea')) 281 .put("euml", Integer.valueOf('\u00eb')) 282 .put("igrave", Integer.valueOf('\u00ec')) 283 .put("iacute", Integer.valueOf('\u00ed')) 284 .put("icirc", Integer.valueOf('\u00ee')) 285 .put("iuml", Integer.valueOf('\u00ef')) 286 .put("eth", Integer.valueOf('\u00f0')) 287 .put("ntilde", Integer.valueOf('\u00f1')) 288 .put("ograve", Integer.valueOf('\u00f2')) 289 .put("oacute", Integer.valueOf('\u00f3')) 290 .put("ocirc", Integer.valueOf('\u00f4')) 291 .put("otilde", Integer.valueOf('\u00f5')) 292 .put("ouml", Integer.valueOf('\u00f6')) 293 .put("divide", Integer.valueOf('\u00f7')) 294 .put("oslash", Integer.valueOf('\u00f8')) 295 .put("ugrave", Integer.valueOf('\u00f9')) 296 .put("uacute", Integer.valueOf('\u00fa')) 297 .put("ucirc", Integer.valueOf('\u00fb')) 298 .put("uuml", Integer.valueOf('\u00fc')) 299 .put("yacute", Integer.valueOf('\u00fd')) 300 .put("thorn", Integer.valueOf('\u00fe')) 301 .put("yuml", Integer.valueOf('\u00ff')) 302 303 // Latin Extended-B 304 .put("fnof", Integer.valueOf('\u0192')) 305 306 // Greek 307 .put("Alpha", Integer.valueOf('\u0391')) 308 .put("Beta", Integer.valueOf('\u0392')) 309 .put("Gamma", Integer.valueOf('\u0393')) 310 .put("Delta", Integer.valueOf('\u0394')) 311 .put("Epsilon", Integer.valueOf('\u0395')) 312 .put("Zeta", 
Integer.valueOf('\u0396')) 313 .put("Eta", Integer.valueOf('\u0397')) 314 .put("Theta", Integer.valueOf('\u0398')) 315 .put("Iota", Integer.valueOf('\u0399')) 316 .put("Kappa", Integer.valueOf('\u039a')) 317 .put("Lambda", Integer.valueOf('\u039b')) 318 .put("Mu", Integer.valueOf('\u039c')) 319 .put("Nu", Integer.valueOf('\u039d')) 320 .put("Xi", Integer.valueOf('\u039e')) 321 .put("Omicron", Integer.valueOf('\u039f')) 322 .put("Pi", Integer.valueOf('\u03a0')) 323 .put("Rho", Integer.valueOf('\u03a1')) 324 .put("Sigma", Integer.valueOf('\u03a3')) 325 .put("Tau", Integer.valueOf('\u03a4')) 326 .put("Upsilon", Integer.valueOf('\u03a5')) 327 .put("Phi", Integer.valueOf('\u03a6')) 328 .put("Chi", Integer.valueOf('\u03a7')) 329 .put("Psi", Integer.valueOf('\u03a8')) 330 .put("Omega", Integer.valueOf('\u03a9')) 331 332 .put("alpha", Integer.valueOf('\u03b1')) 333 .put("beta", Integer.valueOf('\u03b2')) 334 .put("gamma", Integer.valueOf('\u03b3')) 335 .put("delta", Integer.valueOf('\u03b4')) 336 .put("epsilon", Integer.valueOf('\u03b5')) 337 .put("zeta", Integer.valueOf('\u03b6')) 338 .put("eta", Integer.valueOf('\u03b7')) 339 .put("theta", Integer.valueOf('\u03b8')) 340 .put("iota", Integer.valueOf('\u03b9')) 341 .put("kappa", Integer.valueOf('\u03ba')) 342 .put("lambda", Integer.valueOf('\u03bb')) 343 .put("mu", Integer.valueOf('\u03bc')) 344 .put("nu", Integer.valueOf('\u03bd')) 345 .put("xi", Integer.valueOf('\u03be')) 346 .put("omicron", Integer.valueOf('\u03bf')) 347 .put("pi", Integer.valueOf('\u03c0')) 348 .put("rho", Integer.valueOf('\u03c1')) 349 .put("sigmaf", Integer.valueOf('\u03c2')) 350 .put("sigma", Integer.valueOf('\u03c3')) 351 .put("tau", Integer.valueOf('\u03c4')) 352 .put("upsilon", Integer.valueOf('\u03c5')) 353 .put("phi", Integer.valueOf('\u03c6')) 354 .put("chi", Integer.valueOf('\u03c7')) 355 .put("psi", Integer.valueOf('\u03c8')) 356 .put("omega", Integer.valueOf('\u03c9')) 357 .put("thetasym", Integer.valueOf('\u03d1')) 358 .put("upsih", 
Integer.valueOf('\u03d2')) 359 .put("piv", Integer.valueOf('\u03d6')) 360 361 // General Punctuation 362 .put("bull", Integer.valueOf('\u2022')) 363 .put("hellip", Integer.valueOf('\u2026')) 364 .put("prime", Integer.valueOf('\u2032')) 365 .put("Prime", Integer.valueOf('\u2033')) 366 .put("oline", Integer.valueOf('\u203e')) 367 .put("frasl", Integer.valueOf('\u2044')) 368 369 // Letterlike Symbols 370 .put("weierp", Integer.valueOf('\u2118')) 371 .put("image", Integer.valueOf('\u2111')) 372 .put("real", Integer.valueOf('\u211c')) 373 .put("trade", Integer.valueOf('\u2122')) 374 .put("alefsym", Integer.valueOf('\u2135')) 375 376 // Arrows 377 .put("larr", Integer.valueOf('\u2190')) 378 .put("uarr", Integer.valueOf('\u2191')) 379 .put("rarr", Integer.valueOf('\u2192')) 380 .put("darr", Integer.valueOf('\u2193')) 381 .put("harr", Integer.valueOf('\u2194')) 382 .put("crarr", Integer.valueOf('\u21b5')) 383 .put("lArr", Integer.valueOf('\u21d0')) 384 .put("uArr", Integer.valueOf('\u21d1')) 385 .put("rArr", Integer.valueOf('\u21d2')) 386 .put("dArr", Integer.valueOf('\u21d3')) 387 .put("hArr", Integer.valueOf('\u21d4')) 388 389 // Mathematical Operators 390 .put("forall", Integer.valueOf('\u2200')) 391 .put("part", Integer.valueOf('\u2202')) 392 .put("exist", Integer.valueOf('\u2203')) 393 .put("empty", Integer.valueOf('\u2205')) 394 .put("nabla", Integer.valueOf('\u2207')) 395 .put("isin", Integer.valueOf('\u2208')) 396 .put("notin", Integer.valueOf('\u2209')) 397 .put("ni", Integer.valueOf('\u220b')) 398 .put("prod", Integer.valueOf('\u220f')) 399 .put("sum", Integer.valueOf('\u2211')) 400 .put("minus", Integer.valueOf('\u2212')) 401 .put("lowast", Integer.valueOf('\u2217')) 402 .put("radic", Integer.valueOf('\u221a')) 403 .put("prop", Integer.valueOf('\u221d')) 404 .put("infin", Integer.valueOf('\u221e')) 405 .put("ang", Integer.valueOf('\u2220')) 406 .put("and", Integer.valueOf('\u2227')) 407 .put("or", Integer.valueOf('\u2228')) 408 .put("cap", 
Integer.valueOf('\u2229')) 409 .put("cup", Integer.valueOf('\u222a')) 410 .put("int", Integer.valueOf('\u222b')) 411 .put("there4", Integer.valueOf('\u2234')) 412 .put("sim", Integer.valueOf('\u223c')) 413 .put("cong", Integer.valueOf('\u2245')) 414 .put("asymp", Integer.valueOf('\u2248')) 415 .put("ne", Integer.valueOf('\u2260')) 416 .put("equiv", Integer.valueOf('\u2261')) 417 .put("le", Integer.valueOf('\u2264')) 418 .put("ge", Integer.valueOf('\u2265')) 419 .put("sub", Integer.valueOf('\u2282')) 420 .put("sup", Integer.valueOf('\u2283')) 421 .put("nsub", Integer.valueOf('\u2284')) 422 .put("sube", Integer.valueOf('\u2286')) 423 .put("supe", Integer.valueOf('\u2287')) 424 .put("oplus", Integer.valueOf('\u2295')) 425 .put("otimes", Integer.valueOf('\u2297')) 426 .put("perp", Integer.valueOf('\u22a5')) 427 .put("sdot", Integer.valueOf('\u22c5')) 428 429 // Miscellaneous Technical 430 .put("lceil", Integer.valueOf('\u2308')) 431 .put("rceil", Integer.valueOf('\u2309')) 432 .put("lfloor", Integer.valueOf('\u230a')) 433 .put("rfloor", Integer.valueOf('\u230b')) 434 .put("lang", Integer.valueOf('\u2329')) 435 .put("rang", Integer.valueOf('\u232a')) 436 437 // Geometric Shapes 438 .put("loz", Integer.valueOf('\u25ca')) 439 440 // Miscellaneous Symbols 441 .put("spades", Integer.valueOf('\u2660')) 442 .put("clubs", Integer.valueOf('\u2663')) 443 .put("hearts", Integer.valueOf('\u2665')) 444 .put("diams", Integer.valueOf('\u2666')) 445 446 // Latin Extended-A 447 .put("OElig", Integer.valueOf('\u0152')) 448 .put("oelig", Integer.valueOf('\u0153')) 449 .put("Scaron", Integer.valueOf('\u0160')) 450 .put("scaron", Integer.valueOf('\u0161')) 451 .put("Yuml", Integer.valueOf('\u0178')) 452 453 // Spacing Modifier Letters 454 .put("circ", Integer.valueOf('\u02c6')) 455 .put("tilde", Integer.valueOf('\u02dc')) 456 457 // General Punctuation 458 .put("ensp", Integer.valueOf('\u2002')) 459 .put("emsp", Integer.valueOf('\u2003')) 460 .put("thinsp", Integer.valueOf('\u2009')) 461 
.put("zwnj", Integer.valueOf('\u200c')) 462 .put("zwj", Integer.valueOf('\u200d')) 463 .put("lrm", Integer.valueOf('\u200e')) 464 .put("rlm", Integer.valueOf('\u200f')) 465 .put("ndash", Integer.valueOf('\u2013')) 466 .put("mdash", Integer.valueOf('\u2014')) 467 .put("lsquo", Integer.valueOf('\u2018')) 468 .put("rsquo", Integer.valueOf('\u2019')) 469 .put("sbquo", Integer.valueOf('\u201a')) 470 .put("ldquo", Integer.valueOf('\u201c')) 471 .put("rdquo", Integer.valueOf('\u201d')) 472 .put("bdquo", Integer.valueOf('\u201e')) 473 .put("dagger", Integer.valueOf('\u2020')) 474 .put("Dagger", Integer.valueOf('\u2021')) 475 .put("permil", Integer.valueOf('\u2030')) 476 .put("lsaquo", Integer.valueOf('\u2039')) 477 .put("rsaquo", Integer.valueOf('\u203a')) 478 .put("euro", Integer.valueOf('\u20ac')) 479 .build()); 480 481 private HtmlEntities() { /* uninstantiable */ } 482} 483