HtmlEntities.java revision 8403881c365ab36b721ccc4500af1b3a5bd25870
1// Copyright (c) 2011, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import com.google.common.collect.ImmutableMap;
32
33class HtmlEntities {
34
35  public static long decodeEntityAt(String html, int offset, int limit) {
36    char ch = html.charAt(offset);
37    if ('&' != ch) {
38      return ((offset + 1L) << 32) | ch;
39    }
40
41    int entityLimit = Math.min(limit, offset + 10);
42    int end = -1;
43    int tail = -1;
44    if (entityLimit == limit) {
45      // Assume a broken entity that ends at the end until shown otherwise.
46      end = tail = entityLimit;
47    }
48    entityloop:
49    for (int i = offset + 1; i < entityLimit; ++i) {
50      switch (html.charAt(i)) {
51        case ';':
52          end = i;
53          tail = end + 1;
54          break entityloop;
55        case '#':
56        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
57        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
58        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
59        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
60        case 'Y': case 'Z':
61        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
62        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
63        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
64        case 's': case 't': case 'u': case 'v': case 'w': case 'x':
65        case 'y': case 'z':
66        case '0': case '1': case '2': case '3': case '4': case '5':
67        case '6': case '7': case '8': case '9':
68          break;
69        case '=':
70          return ((offset + 1L) << 32) | '&';
71        default:  // A possible broken entity.
72          end = i;
73          tail = i;
74          break entityloop;
75      }
76    }
77    if (end < 0 || offset + 2 >= end) {
78      return ((offset + 1L) << 32) | '&';
79    }
80    // Now we know where the entity ends, and that there is at least one
81    // character in the entity name
82    char ch1 = html.charAt(offset + 1);
83    char ch2 = html.charAt(offset + 2);
84    int codepoint = -1;
85    if ('#' == ch1) {
86      // numeric entity
87      if ('x' == ch2 || 'X' == ch2) {
88        if (end == offset + 3) {  // No digits
89          return ((offset + 1L) << 32) | '&';
90        }
91        codepoint = 0;
92        // hex literal
93        digloop:
94        for (int i = offset + 3; i < end; ++i) {
95          char digit = html.charAt(i);
96          switch (digit & 0xfff8) {
97            case 0x30: case 0x38: // ASCII 48-57 are '0'-'9'
98              int decDig = digit & 0xf;
99              if (decDig < 10) {
100                codepoint = (codepoint << 4) | decDig;
101              } else {
102                codepoint = -1;
103                break digloop;
104              }
105              break;
106            // ASCII 65-70 and 97-102 are 'A'-'Z' && 'a'-'z'
107            case 0x40: case 0x60:
108              int hexDig = (digit & 0x7);
109              if (hexDig != 0 && hexDig < 7) {
110                codepoint = (codepoint << 4) | (hexDig + 9);
111              } else {
112                codepoint = -1;
113                break digloop;
114              }
115              break;
116            default:
117              codepoint = -1;
118              break digloop;
119          }
120        }
121      } else {
122        codepoint = 0;
123        // decimal literal
124        digloop:
125        for (int i = offset + 2; i < end; ++i) {
126          char digit = html.charAt(i);
127          switch (digit & 0xfff8) {
128            case 0x30: case 0x38: // ASCII 48-57 are '0'-'9'
129              int decDig = digit - '0';
130              if (decDig < 10) {
131                codepoint = (codepoint * 10) + decDig;
132              } else {
133                codepoint = -1;
134                break digloop;
135              }
136              break;
137            default:
138              codepoint = -1;
139              break digloop;
140          }
141        }
142      }
143    } else {
144      Trie t = ENTITY_TRIE;
145      for (int i = offset + 1; i < end; ++i) {
146        char nameChar = html.charAt(i);
147        t = t.lookup(nameChar);
148        if (t == null) { break; }
149      }
150      if (t == null) {
151        t = ENTITY_TRIE;
152        for (int i = offset + 1; i < end; ++i) {
153          char nameChar = html.charAt(i);
154          if ('Z' >= nameChar && nameChar >= 'A') { nameChar |= 32; }
155          t = t.lookup(nameChar);
156          if (t == null) { break; }
157        }
158      }
159      if (t != null && t.isTerminal()) {
160        codepoint = t.getValue();
161      }
162    }
163    if (codepoint < 0) {
164      return ((offset + 1L) << 32) | '&';
165    } else {
166      return (((long) tail) << 32) | codepoint;
167    }
168  }
169
170  /** A possible entity name like "amp" or "gt". */
171  public static boolean isEntityName(String name) {
172    Trie t = ENTITY_TRIE;
173    int n = name.length();
174
175    // Treat AMP the same amp, but not Amp.
176    boolean isUcase = true;
177    for (int i = 0; i < n; ++i) {
178      char ch = name.charAt(i);
179      if (!('A' <= ch && ch <= 'Z')) {
180        isUcase = false;
181        break;
182      }
183    }
184
185    if (isUcase) { name = Strings.toLowerCase(name); }
186
187    for (int i = 0; i < n; ++i) {
188      t = t.lookup(name.charAt(i));
189      if (t == null) { return false; }
190    }
191    return t.isTerminal();
192  }
193
  /**
   * Maps entity names, written without the leading {@code '&'} and the
   * trailing {@code ';'}, to the code point each one stands for.  Names are
   * stored with the exact case shown here; any case folding is done by the
   * code that walks the trie.  The table holds the four basic markup
   * entities, XML's {@code apos}, and the HTML 4 entity set.
   */
  public static final Trie ENTITY_TRIE = new Trie(
      ImmutableMap.<String, Integer>builder()
    // C0 Controls and Basic Latin
      .put("quot", Integer.valueOf('"'))
      .put("amp", Integer.valueOf('&'))
      .put("lt", Integer.valueOf('<'))
      .put("gt", Integer.valueOf('>'))

    // XML 1.0
      .put("apos", Integer.valueOf('\''))

    // HTML4 entities
      .put("nbsp", Integer.valueOf('\u00a0'))
      .put("iexcl", Integer.valueOf('\u00a1'))
      .put("cent", Integer.valueOf('\u00a2'))
      .put("pound", Integer.valueOf('\u00a3'))
      .put("curren", Integer.valueOf('\u00a4'))
      .put("yen", Integer.valueOf('\u00a5'))
      .put("brvbar", Integer.valueOf('\u00a6'))
      .put("sect", Integer.valueOf('\u00a7'))
      .put("uml", Integer.valueOf('\u00a8'))
      .put("copy", Integer.valueOf('\u00a9'))
      .put("ordf", Integer.valueOf('\u00aa'))
      .put("laquo", Integer.valueOf('\u00ab'))
      .put("not", Integer.valueOf('\u00ac'))
      .put("shy", Integer.valueOf('\u00ad'))
      .put("reg", Integer.valueOf('\u00ae'))
      .put("macr", Integer.valueOf('\u00af'))
      .put("deg", Integer.valueOf('\u00b0'))
      .put("plusmn", Integer.valueOf('\u00b1'))
      .put("sup2", Integer.valueOf('\u00b2'))
      .put("sup3", Integer.valueOf('\u00b3'))
      .put("acute", Integer.valueOf('\u00b4'))
      .put("micro", Integer.valueOf('\u00b5'))
      .put("para", Integer.valueOf('\u00b6'))
      .put("middot", Integer.valueOf('\u00b7'))
      .put("cedil", Integer.valueOf('\u00b8'))
      .put("sup1", Integer.valueOf('\u00b9'))
      .put("ordm", Integer.valueOf('\u00ba'))
      .put("raquo", Integer.valueOf('\u00bb'))
      .put("frac14", Integer.valueOf('\u00bc'))
      .put("frac12", Integer.valueOf('\u00bd'))
      .put("frac34", Integer.valueOf('\u00be'))
      .put("iquest", Integer.valueOf('\u00bf'))
      .put("Agrave", Integer.valueOf('\u00c0'))
      .put("Aacute", Integer.valueOf('\u00c1'))
      .put("Acirc", Integer.valueOf('\u00c2'))
      .put("Atilde", Integer.valueOf('\u00c3'))
      .put("Auml", Integer.valueOf('\u00c4'))
      .put("Aring", Integer.valueOf('\u00c5'))
      .put("AElig", Integer.valueOf('\u00c6'))
      .put("Ccedil", Integer.valueOf('\u00c7'))
      .put("Egrave", Integer.valueOf('\u00c8'))
      .put("Eacute", Integer.valueOf('\u00c9'))
      .put("Ecirc", Integer.valueOf('\u00ca'))
      .put("Euml", Integer.valueOf('\u00cb'))
      .put("Igrave", Integer.valueOf('\u00cc'))
      .put("Iacute", Integer.valueOf('\u00cd'))
      .put("Icirc", Integer.valueOf('\u00ce'))
      .put("Iuml", Integer.valueOf('\u00cf'))
      .put("ETH", Integer.valueOf('\u00d0'))
      .put("Ntilde", Integer.valueOf('\u00d1'))
      .put("Ograve", Integer.valueOf('\u00d2'))
      .put("Oacute", Integer.valueOf('\u00d3'))
      .put("Ocirc", Integer.valueOf('\u00d4'))
      .put("Otilde", Integer.valueOf('\u00d5'))
      .put("Ouml", Integer.valueOf('\u00d6'))
      .put("times", Integer.valueOf('\u00d7'))
      .put("Oslash", Integer.valueOf('\u00d8'))
      .put("Ugrave", Integer.valueOf('\u00d9'))
      .put("Uacute", Integer.valueOf('\u00da'))
      .put("Ucirc", Integer.valueOf('\u00db'))
      .put("Uuml", Integer.valueOf('\u00dc'))
      .put("Yacute", Integer.valueOf('\u00dd'))
      .put("THORN", Integer.valueOf('\u00de'))
      .put("szlig", Integer.valueOf('\u00df'))
      .put("agrave", Integer.valueOf('\u00e0'))
      .put("aacute", Integer.valueOf('\u00e1'))
      .put("acirc", Integer.valueOf('\u00e2'))
      .put("atilde", Integer.valueOf('\u00e3'))
      .put("auml", Integer.valueOf('\u00e4'))
      .put("aring", Integer.valueOf('\u00e5'))
      .put("aelig", Integer.valueOf('\u00e6'))
      .put("ccedil", Integer.valueOf('\u00e7'))
      .put("egrave", Integer.valueOf('\u00e8'))
      .put("eacute", Integer.valueOf('\u00e9'))
      .put("ecirc", Integer.valueOf('\u00ea'))
      .put("euml", Integer.valueOf('\u00eb'))
      .put("igrave", Integer.valueOf('\u00ec'))
      .put("iacute", Integer.valueOf('\u00ed'))
      .put("icirc", Integer.valueOf('\u00ee'))
      .put("iuml", Integer.valueOf('\u00ef'))
      .put("eth", Integer.valueOf('\u00f0'))
      .put("ntilde", Integer.valueOf('\u00f1'))
      .put("ograve", Integer.valueOf('\u00f2'))
      .put("oacute", Integer.valueOf('\u00f3'))
      .put("ocirc", Integer.valueOf('\u00f4'))
      .put("otilde", Integer.valueOf('\u00f5'))
      .put("ouml", Integer.valueOf('\u00f6'))
      .put("divide", Integer.valueOf('\u00f7'))
      .put("oslash", Integer.valueOf('\u00f8'))
      .put("ugrave", Integer.valueOf('\u00f9'))
      .put("uacute", Integer.valueOf('\u00fa'))
      .put("ucirc", Integer.valueOf('\u00fb'))
      .put("uuml", Integer.valueOf('\u00fc'))
      .put("yacute", Integer.valueOf('\u00fd'))
      .put("thorn", Integer.valueOf('\u00fe'))
      .put("yuml", Integer.valueOf('\u00ff'))

    // Latin Extended-B
      .put("fnof", Integer.valueOf('\u0192'))

    // Greek
      .put("Alpha", Integer.valueOf('\u0391'))
      .put("Beta", Integer.valueOf('\u0392'))
      .put("Gamma", Integer.valueOf('\u0393'))
      .put("Delta", Integer.valueOf('\u0394'))
      .put("Epsilon", Integer.valueOf('\u0395'))
      .put("Zeta", Integer.valueOf('\u0396'))
      .put("Eta", Integer.valueOf('\u0397'))
      .put("Theta", Integer.valueOf('\u0398'))
      .put("Iota", Integer.valueOf('\u0399'))
      .put("Kappa", Integer.valueOf('\u039a'))
      .put("Lambda", Integer.valueOf('\u039b'))
      .put("Mu", Integer.valueOf('\u039c'))
      .put("Nu", Integer.valueOf('\u039d'))
      .put("Xi", Integer.valueOf('\u039e'))
      .put("Omicron", Integer.valueOf('\u039f'))
      .put("Pi", Integer.valueOf('\u03a0'))
      .put("Rho", Integer.valueOf('\u03a1'))
      .put("Sigma", Integer.valueOf('\u03a3'))
      .put("Tau", Integer.valueOf('\u03a4'))
      .put("Upsilon", Integer.valueOf('\u03a5'))
      .put("Phi", Integer.valueOf('\u03a6'))
      .put("Chi", Integer.valueOf('\u03a7'))
      .put("Psi", Integer.valueOf('\u03a8'))
      .put("Omega", Integer.valueOf('\u03a9'))

      .put("alpha", Integer.valueOf('\u03b1'))
      .put("beta", Integer.valueOf('\u03b2'))
      .put("gamma", Integer.valueOf('\u03b3'))
      .put("delta", Integer.valueOf('\u03b4'))
      .put("epsilon", Integer.valueOf('\u03b5'))
      .put("zeta", Integer.valueOf('\u03b6'))
      .put("eta", Integer.valueOf('\u03b7'))
      .put("theta", Integer.valueOf('\u03b8'))
      .put("iota", Integer.valueOf('\u03b9'))
      .put("kappa", Integer.valueOf('\u03ba'))
      .put("lambda", Integer.valueOf('\u03bb'))
      .put("mu", Integer.valueOf('\u03bc'))
      .put("nu", Integer.valueOf('\u03bd'))
      .put("xi", Integer.valueOf('\u03be'))
      .put("omicron", Integer.valueOf('\u03bf'))
      .put("pi", Integer.valueOf('\u03c0'))
      .put("rho", Integer.valueOf('\u03c1'))
      .put("sigmaf", Integer.valueOf('\u03c2'))
      .put("sigma", Integer.valueOf('\u03c3'))
      .put("tau", Integer.valueOf('\u03c4'))
      .put("upsilon", Integer.valueOf('\u03c5'))
      .put("phi", Integer.valueOf('\u03c6'))
      .put("chi", Integer.valueOf('\u03c7'))
      .put("psi", Integer.valueOf('\u03c8'))
      .put("omega", Integer.valueOf('\u03c9'))
      .put("thetasym", Integer.valueOf('\u03d1'))
      .put("upsih", Integer.valueOf('\u03d2'))
      .put("piv", Integer.valueOf('\u03d6'))

    // General Punctuation
      .put("bull", Integer.valueOf('\u2022'))
      .put("hellip", Integer.valueOf('\u2026'))
      .put("prime", Integer.valueOf('\u2032'))
      .put("Prime", Integer.valueOf('\u2033'))
      .put("oline", Integer.valueOf('\u203e'))
      .put("frasl", Integer.valueOf('\u2044'))

    // Letterlike Symbols
      .put("weierp", Integer.valueOf('\u2118'))
      .put("image", Integer.valueOf('\u2111'))
      .put("real", Integer.valueOf('\u211c'))
      .put("trade", Integer.valueOf('\u2122'))
      .put("alefsym", Integer.valueOf('\u2135'))

    // Arrows
      .put("larr", Integer.valueOf('\u2190'))
      .put("uarr", Integer.valueOf('\u2191'))
      .put("rarr", Integer.valueOf('\u2192'))
      .put("darr", Integer.valueOf('\u2193'))
      .put("harr", Integer.valueOf('\u2194'))
      .put("crarr", Integer.valueOf('\u21b5'))
      .put("lArr", Integer.valueOf('\u21d0'))
      .put("uArr", Integer.valueOf('\u21d1'))
      .put("rArr", Integer.valueOf('\u21d2'))
      .put("dArr", Integer.valueOf('\u21d3'))
      .put("hArr", Integer.valueOf('\u21d4'))

    // Mathematical Operators
      .put("forall", Integer.valueOf('\u2200'))
      .put("part", Integer.valueOf('\u2202'))
      .put("exist", Integer.valueOf('\u2203'))
      .put("empty", Integer.valueOf('\u2205'))
      .put("nabla", Integer.valueOf('\u2207'))
      .put("isin", Integer.valueOf('\u2208'))
      .put("notin", Integer.valueOf('\u2209'))
      .put("ni", Integer.valueOf('\u220b'))
      .put("prod", Integer.valueOf('\u220f'))
      .put("sum", Integer.valueOf('\u2211'))
      .put("minus", Integer.valueOf('\u2212'))
      .put("lowast", Integer.valueOf('\u2217'))
      .put("radic", Integer.valueOf('\u221a'))
      .put("prop", Integer.valueOf('\u221d'))
      .put("infin", Integer.valueOf('\u221e'))
      .put("ang", Integer.valueOf('\u2220'))
      .put("and", Integer.valueOf('\u2227'))
      .put("or", Integer.valueOf('\u2228'))
      .put("cap", Integer.valueOf('\u2229'))
      .put("cup", Integer.valueOf('\u222a'))
      .put("int", Integer.valueOf('\u222b'))
      .put("there4", Integer.valueOf('\u2234'))
      .put("sim", Integer.valueOf('\u223c'))
      .put("cong", Integer.valueOf('\u2245'))
      .put("asymp", Integer.valueOf('\u2248'))
      .put("ne", Integer.valueOf('\u2260'))
      .put("equiv", Integer.valueOf('\u2261'))
      .put("le", Integer.valueOf('\u2264'))
      .put("ge", Integer.valueOf('\u2265'))
      .put("sub", Integer.valueOf('\u2282'))
      .put("sup", Integer.valueOf('\u2283'))
      .put("nsub", Integer.valueOf('\u2284'))
      .put("sube", Integer.valueOf('\u2286'))
      .put("supe", Integer.valueOf('\u2287'))
      .put("oplus", Integer.valueOf('\u2295'))
      .put("otimes", Integer.valueOf('\u2297'))
      .put("perp", Integer.valueOf('\u22a5'))
      .put("sdot", Integer.valueOf('\u22c5'))

    // Miscellaneous Technical
      .put("lceil", Integer.valueOf('\u2308'))
      .put("rceil", Integer.valueOf('\u2309'))
      .put("lfloor", Integer.valueOf('\u230a'))
      .put("rfloor", Integer.valueOf('\u230b'))
      .put("lang", Integer.valueOf('\u2329'))
      .put("rang", Integer.valueOf('\u232a'))

    // Geometric Shapes
      .put("loz", Integer.valueOf('\u25ca'))

    // Miscellaneous Symbols
      .put("spades", Integer.valueOf('\u2660'))
      .put("clubs", Integer.valueOf('\u2663'))
      .put("hearts", Integer.valueOf('\u2665'))
      .put("diams", Integer.valueOf('\u2666'))

    // Latin Extended-A
      .put("OElig", Integer.valueOf('\u0152'))
      .put("oelig", Integer.valueOf('\u0153'))
      .put("Scaron", Integer.valueOf('\u0160'))
      .put("scaron", Integer.valueOf('\u0161'))
      .put("Yuml", Integer.valueOf('\u0178'))

    // Spacing Modifier Letters
      .put("circ", Integer.valueOf('\u02c6'))
      .put("tilde", Integer.valueOf('\u02dc'))

    // General Punctuation
      .put("ensp", Integer.valueOf('\u2002'))
      .put("emsp", Integer.valueOf('\u2003'))
      .put("thinsp", Integer.valueOf('\u2009'))
      .put("zwnj", Integer.valueOf('\u200c'))
      .put("zwj", Integer.valueOf('\u200d'))
      .put("lrm", Integer.valueOf('\u200e'))
      .put("rlm", Integer.valueOf('\u200f'))
      .put("ndash", Integer.valueOf('\u2013'))
      .put("mdash", Integer.valueOf('\u2014'))
      .put("lsquo", Integer.valueOf('\u2018'))
      .put("rsquo", Integer.valueOf('\u2019'))
      .put("sbquo", Integer.valueOf('\u201a'))
      .put("ldquo", Integer.valueOf('\u201c'))
      .put("rdquo", Integer.valueOf('\u201d'))
      .put("bdquo", Integer.valueOf('\u201e'))
      .put("dagger", Integer.valueOf('\u2020'))
      .put("Dagger", Integer.valueOf('\u2021'))
      .put("permil", Integer.valueOf('\u2030'))
      .put("lsaquo", Integer.valueOf('\u2039'))
      .put("rsaquo", Integer.valueOf('\u203a'))
      .put("euro", Integer.valueOf('\u20ac'))
      .build());
480
  /** Private so that this holder of static members cannot be instantiated. */
  private HtmlEntities() { /* uninstantiable */ }
482}
483