1// Copyright (c) 2011, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import com.google.common.collect.ImmutableMap;
32
33/**
 * Utilities for decoding HTML entities, e.g., {@code &amp;}.
35 */
36class HtmlEntities {
37
38  /**
39   * Decodes any HTML entity at the given location.  This handles both named and
40   * numeric entities.
41   *
42   * @param html HTML text.
43   * @param offset the position of the sequence to decode.
44   * @param limit the last position in chars that could be part of the sequence
45   *    to decode.
46   * @return The offset after the end of the decoded sequence and the decoded
47   *    code-point or code-unit packed into a long.
48   *    The first 32 bits are the offset, and the second 32 bits are a
49   *    code-point or a code-unit.
50   */
51  public static long decodeEntityAt(String html, int offset, int limit) {
52    char ch = html.charAt(offset);
53    if ('&' != ch) {
54      return ((offset + 1L) << 32) | ch;
55    }
56
57    int entityLimit = Math.min(limit, offset + 10);
58    int end = -1;
59    int tail = -1;
60    if (entityLimit == limit) {
61      // Assume a broken entity that ends at the end until shown otherwise.
62      end = tail = entityLimit;
63    }
64    entityloop:
65    for (int i = offset + 1; i < entityLimit; ++i) {
66      switch (html.charAt(i)) {
67        case ';':  // An unbroken entity.
68          end = i;
69          tail = end + 1;
70          break entityloop;
71        case '#':
72        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
73        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
74        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
75        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
76        case 'Y': case 'Z':
77        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
78        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
79        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
80        case 's': case 't': case 'u': case 'v': case 'w': case 'x':
81        case 'y': case 'z':
82        case '0': case '1': case '2': case '3': case '4': case '5':
83        case '6': case '7': case '8': case '9':
84          break;
85        case '=':
86          // An equal sign after an entity missing a closing semicolon should
87          // never have the semicolon inserted since that causes trouble with
88          // parameters in partially encoded URLs.
89          return ((offset + 1L) << 32) | '&';
90        default:  // A possible broken entity.
91          end = i;
92          tail = i;
93          break entityloop;
94      }
95    }
96    if (end < 0 || offset + 2 >= end) {
97      return ((offset + 1L) << 32) | '&';
98    }
99    // Now we know where the entity ends, and that there is at least one
100    // character in the entity name
101    char ch1 = html.charAt(offset + 1);
102    char ch2 = html.charAt(offset + 2);
103    int codepoint = -1;
104    if ('#' == ch1) {
105      // numeric entity
106      if ('x' == ch2 || 'X' == ch2) {
107        if (end == offset + 3) {  // No digits
108          return ((offset + 1L) << 32) | '&';
109        }
110        codepoint = 0;
111        // hex literal
112        digloop:
113        for (int i = offset + 3; i < end; ++i) {
114          char digit = html.charAt(i);
115          switch (digit & 0xfff8) {
116            case 0x30: case 0x38: // ASCII 48-57 are '0'-'9'
117              int decDig = digit & 0xf;
118              if (decDig < 10) {
119                codepoint = (codepoint << 4) | decDig;
120              } else {
121                codepoint = -1;
122                break digloop;
123              }
124              break;
125            // ASCII 65-70 and 97-102 are 'A'-'Z' && 'a'-'z'
126            case 0x40: case 0x60:
127              int hexDig = (digit & 0x7);
128              if (hexDig != 0 && hexDig < 7) {
129                codepoint = (codepoint << 4) | (hexDig + 9);
130              } else {
131                codepoint = -1;
132                break digloop;
133              }
134              break;
135            default:
136              codepoint = -1;
137              break digloop;
138          }
139        }
140        if (codepoint > Character.MAX_CODE_POINT) {
141          codepoint = 0xfffd;  // Unknown.
142        }
143      } else {
144        codepoint = 0;
145        // decimal literal
146        digloop:
147        for (int i = offset + 2; i < end; ++i) {
148          char digit = html.charAt(i);
149          switch (digit & 0xfff8) {
150            case 0x30: case 0x38: // ASCII 48-57 are '0'-'9'
151              int decDig = digit - '0';
152              if (decDig < 10) {
153                codepoint = (codepoint * 10) + decDig;
154              } else {
155                codepoint = -1;
156                break digloop;
157              }
158              break;
159            default:
160              codepoint = -1;
161              break digloop;
162          }
163        }
164        if (codepoint > Character.MAX_CODE_POINT) {
165          codepoint = 0xfffd;  // Unknown.
166        }
167      }
168    } else {
169      Trie t = ENTITY_TRIE;
170      for (int i = offset + 1; i < end; ++i) {
171        char nameChar = html.charAt(i);
172        t = t.lookup(nameChar);
173        if (t == null) { break; }
174      }
175      if (t == null) {
176        t = ENTITY_TRIE;
177        for (int i = offset + 1; i < end; ++i) {
178          char nameChar = html.charAt(i);
179          if ('Z' >= nameChar && nameChar >= 'A') { nameChar |= 32; }
180          t = t.lookup(nameChar);
181          if (t == null) { break; }
182        }
183      }
184      if (t != null && t.isTerminal()) {
185        codepoint = t.getValue();
186      }
187    }
188    if (codepoint < 0) {
189      return ((offset + 1L) << 32) | '&';
190    } else {
191      return (((long) tail) << 32) | codepoint;
192    }
193  }
194
195//  /** A possible entity name like "amp" or "gt". */
196//  public static boolean isEntityName(String name) {
197//    Trie t = ENTITY_TRIE;
198//    int n = name.length();
199//
//    // Treat AMP the same as amp, but not Amp.
201//    boolean isUcase = true;
202//    for (int i = 0; i < n; ++i) {
203//      char ch = name.charAt(i);
204//      if (!('A' <= ch && ch <= 'Z')) {
205//        isUcase = false;
206//        break;
207//      }
208//    }
209//
210//    if (isUcase) { name = Strings.toLowerCase(name); }
211//
212//    for (int i = 0; i < n; ++i) {
213//      t = t.lookup(name.charAt(i));
214//      if (t == null) { return false; }
215//    }
216//    return t.isTerminal();
217//  }
218
219  /** A trie that maps entity names to codepoints. */
220  public static final Trie ENTITY_TRIE = new Trie(
221      ImmutableMap.<String, Integer>builder()
222    // C0 Controls and Basic Latin
223      .put("quot", Integer.valueOf('"'))
224      .put("amp", Integer.valueOf('&'))
225      .put("lt", Integer.valueOf('<'))
226      .put("gt", Integer.valueOf('>'))
227
228    // XML 1.0
229      .put("apos", Integer.valueOf('\''))
230
231    // HTML4 entities
232      .put("nbsp", Integer.valueOf('\u00a0'))
233      .put("iexcl", Integer.valueOf('\u00a1'))
234      .put("cent", Integer.valueOf('\u00a2'))
235      .put("pound", Integer.valueOf('\u00a3'))
236      .put("curren", Integer.valueOf('\u00a4'))
237      .put("yen", Integer.valueOf('\u00a5'))
238      .put("brvbar", Integer.valueOf('\u00a6'))
239      .put("sect", Integer.valueOf('\u00a7'))
240      .put("uml", Integer.valueOf('\u00a8'))
241      .put("copy", Integer.valueOf('\u00a9'))
242      .put("ordf", Integer.valueOf('\u00aa'))
243      .put("laquo", Integer.valueOf('\u00ab'))
244      .put("not", Integer.valueOf('\u00ac'))
245      .put("shy", Integer.valueOf('\u00ad'))
246      .put("reg", Integer.valueOf('\u00ae'))
247      .put("macr", Integer.valueOf('\u00af'))
248      .put("deg", Integer.valueOf('\u00b0'))
249      .put("plusmn", Integer.valueOf('\u00b1'))
250      .put("sup2", Integer.valueOf('\u00b2'))
251      .put("sup3", Integer.valueOf('\u00b3'))
252      .put("acute", Integer.valueOf('\u00b4'))
253      .put("micro", Integer.valueOf('\u00b5'))
254      .put("para", Integer.valueOf('\u00b6'))
255      .put("middot", Integer.valueOf('\u00b7'))
256      .put("cedil", Integer.valueOf('\u00b8'))
257      .put("sup1", Integer.valueOf('\u00b9'))
258      .put("ordm", Integer.valueOf('\u00ba'))
259      .put("raquo", Integer.valueOf('\u00bb'))
260      .put("frac14", Integer.valueOf('\u00bc'))
261      .put("frac12", Integer.valueOf('\u00bd'))
262      .put("frac34", Integer.valueOf('\u00be'))
263      .put("iquest", Integer.valueOf('\u00bf'))
264      .put("Agrave", Integer.valueOf('\u00c0'))
265      .put("Aacute", Integer.valueOf('\u00c1'))
266      .put("Acirc", Integer.valueOf('\u00c2'))
267      .put("Atilde", Integer.valueOf('\u00c3'))
268      .put("Auml", Integer.valueOf('\u00c4'))
269      .put("Aring", Integer.valueOf('\u00c5'))
270      .put("AElig", Integer.valueOf('\u00c6'))
271      .put("Ccedil", Integer.valueOf('\u00c7'))
272      .put("Egrave", Integer.valueOf('\u00c8'))
273      .put("Eacute", Integer.valueOf('\u00c9'))
274      .put("Ecirc", Integer.valueOf('\u00ca'))
275      .put("Euml", Integer.valueOf('\u00cb'))
276      .put("Igrave", Integer.valueOf('\u00cc'))
277      .put("Iacute", Integer.valueOf('\u00cd'))
278      .put("Icirc", Integer.valueOf('\u00ce'))
279      .put("Iuml", Integer.valueOf('\u00cf'))
280      .put("ETH", Integer.valueOf('\u00d0'))
281      .put("Ntilde", Integer.valueOf('\u00d1'))
282      .put("Ograve", Integer.valueOf('\u00d2'))
283      .put("Oacute", Integer.valueOf('\u00d3'))
284      .put("Ocirc", Integer.valueOf('\u00d4'))
285      .put("Otilde", Integer.valueOf('\u00d5'))
286      .put("Ouml", Integer.valueOf('\u00d6'))
287      .put("times", Integer.valueOf('\u00d7'))
288      .put("Oslash", Integer.valueOf('\u00d8'))
289      .put("Ugrave", Integer.valueOf('\u00d9'))
290      .put("Uacute", Integer.valueOf('\u00da'))
291      .put("Ucirc", Integer.valueOf('\u00db'))
292      .put("Uuml", Integer.valueOf('\u00dc'))
293      .put("Yacute", Integer.valueOf('\u00dd'))
294      .put("THORN", Integer.valueOf('\u00de'))
295      .put("szlig", Integer.valueOf('\u00df'))
296      .put("agrave", Integer.valueOf('\u00e0'))
297      .put("aacute", Integer.valueOf('\u00e1'))
298      .put("acirc", Integer.valueOf('\u00e2'))
299      .put("atilde", Integer.valueOf('\u00e3'))
300      .put("auml", Integer.valueOf('\u00e4'))
301      .put("aring", Integer.valueOf('\u00e5'))
302      .put("aelig", Integer.valueOf('\u00e6'))
303      .put("ccedil", Integer.valueOf('\u00e7'))
304      .put("egrave", Integer.valueOf('\u00e8'))
305      .put("eacute", Integer.valueOf('\u00e9'))
306      .put("ecirc", Integer.valueOf('\u00ea'))
307      .put("euml", Integer.valueOf('\u00eb'))
308      .put("igrave", Integer.valueOf('\u00ec'))
309      .put("iacute", Integer.valueOf('\u00ed'))
310      .put("icirc", Integer.valueOf('\u00ee'))
311      .put("iuml", Integer.valueOf('\u00ef'))
312      .put("eth", Integer.valueOf('\u00f0'))
313      .put("ntilde", Integer.valueOf('\u00f1'))
314      .put("ograve", Integer.valueOf('\u00f2'))
315      .put("oacute", Integer.valueOf('\u00f3'))
316      .put("ocirc", Integer.valueOf('\u00f4'))
317      .put("otilde", Integer.valueOf('\u00f5'))
318      .put("ouml", Integer.valueOf('\u00f6'))
319      .put("divide", Integer.valueOf('\u00f7'))
320      .put("oslash", Integer.valueOf('\u00f8'))
321      .put("ugrave", Integer.valueOf('\u00f9'))
322      .put("uacute", Integer.valueOf('\u00fa'))
323      .put("ucirc", Integer.valueOf('\u00fb'))
324      .put("uuml", Integer.valueOf('\u00fc'))
325      .put("yacute", Integer.valueOf('\u00fd'))
326      .put("thorn", Integer.valueOf('\u00fe'))
327      .put("yuml", Integer.valueOf('\u00ff'))
328
329    // Latin Extended-B
330      .put("fnof", Integer.valueOf('\u0192'))
331
332    // Greek
333      .put("Alpha", Integer.valueOf('\u0391'))
334      .put("Beta", Integer.valueOf('\u0392'))
335      .put("Gamma", Integer.valueOf('\u0393'))
336      .put("Delta", Integer.valueOf('\u0394'))
337      .put("Epsilon", Integer.valueOf('\u0395'))
338      .put("Zeta", Integer.valueOf('\u0396'))
339      .put("Eta", Integer.valueOf('\u0397'))
340      .put("Theta", Integer.valueOf('\u0398'))
341      .put("Iota", Integer.valueOf('\u0399'))
342      .put("Kappa", Integer.valueOf('\u039a'))
343      .put("Lambda", Integer.valueOf('\u039b'))
344      .put("Mu", Integer.valueOf('\u039c'))
345      .put("Nu", Integer.valueOf('\u039d'))
346      .put("Xi", Integer.valueOf('\u039e'))
347      .put("Omicron", Integer.valueOf('\u039f'))
348      .put("Pi", Integer.valueOf('\u03a0'))
349      .put("Rho", Integer.valueOf('\u03a1'))
350      .put("Sigma", Integer.valueOf('\u03a3'))
351      .put("Tau", Integer.valueOf('\u03a4'))
352      .put("Upsilon", Integer.valueOf('\u03a5'))
353      .put("Phi", Integer.valueOf('\u03a6'))
354      .put("Chi", Integer.valueOf('\u03a7'))
355      .put("Psi", Integer.valueOf('\u03a8'))
356      .put("Omega", Integer.valueOf('\u03a9'))
357
358      .put("alpha", Integer.valueOf('\u03b1'))
359      .put("beta", Integer.valueOf('\u03b2'))
360      .put("gamma", Integer.valueOf('\u03b3'))
361      .put("delta", Integer.valueOf('\u03b4'))
362      .put("epsilon", Integer.valueOf('\u03b5'))
363      .put("zeta", Integer.valueOf('\u03b6'))
364      .put("eta", Integer.valueOf('\u03b7'))
365      .put("theta", Integer.valueOf('\u03b8'))
366      .put("iota", Integer.valueOf('\u03b9'))
367      .put("kappa", Integer.valueOf('\u03ba'))
368      .put("lambda", Integer.valueOf('\u03bb'))
369      .put("mu", Integer.valueOf('\u03bc'))
370      .put("nu", Integer.valueOf('\u03bd'))
371      .put("xi", Integer.valueOf('\u03be'))
372      .put("omicron", Integer.valueOf('\u03bf'))
373      .put("pi", Integer.valueOf('\u03c0'))
374      .put("rho", Integer.valueOf('\u03c1'))
375      .put("sigmaf", Integer.valueOf('\u03c2'))
376      .put("sigma", Integer.valueOf('\u03c3'))
377      .put("tau", Integer.valueOf('\u03c4'))
378      .put("upsilon", Integer.valueOf('\u03c5'))
379      .put("phi", Integer.valueOf('\u03c6'))
380      .put("chi", Integer.valueOf('\u03c7'))
381      .put("psi", Integer.valueOf('\u03c8'))
382      .put("omega", Integer.valueOf('\u03c9'))
383      .put("thetasym", Integer.valueOf('\u03d1'))
384      .put("upsih", Integer.valueOf('\u03d2'))
385      .put("piv", Integer.valueOf('\u03d6'))
386
387    // General Punctuation
388      .put("bull", Integer.valueOf('\u2022'))
389      .put("hellip", Integer.valueOf('\u2026'))
390      .put("prime", Integer.valueOf('\u2032'))
391      .put("Prime", Integer.valueOf('\u2033'))
392      .put("oline", Integer.valueOf('\u203e'))
393      .put("frasl", Integer.valueOf('\u2044'))
394
395    // Letterlike Symbols
396      .put("weierp", Integer.valueOf('\u2118'))
397      .put("image", Integer.valueOf('\u2111'))
398      .put("real", Integer.valueOf('\u211c'))
399      .put("trade", Integer.valueOf('\u2122'))
400      .put("alefsym", Integer.valueOf('\u2135'))
401
402    // Arrows
403      .put("larr", Integer.valueOf('\u2190'))
404      .put("uarr", Integer.valueOf('\u2191'))
405      .put("rarr", Integer.valueOf('\u2192'))
406      .put("darr", Integer.valueOf('\u2193'))
407      .put("harr", Integer.valueOf('\u2194'))
408      .put("crarr", Integer.valueOf('\u21b5'))
409      .put("lArr", Integer.valueOf('\u21d0'))
410      .put("uArr", Integer.valueOf('\u21d1'))
411      .put("rArr", Integer.valueOf('\u21d2'))
412      .put("dArr", Integer.valueOf('\u21d3'))
413      .put("hArr", Integer.valueOf('\u21d4'))
414
415    // Mathematical Operators
416      .put("forall", Integer.valueOf('\u2200'))
417      .put("part", Integer.valueOf('\u2202'))
418      .put("exist", Integer.valueOf('\u2203'))
419      .put("empty", Integer.valueOf('\u2205'))
420      .put("nabla", Integer.valueOf('\u2207'))
421      .put("isin", Integer.valueOf('\u2208'))
422      .put("notin", Integer.valueOf('\u2209'))
423      .put("ni", Integer.valueOf('\u220b'))
424      .put("prod", Integer.valueOf('\u220f'))
425      .put("sum", Integer.valueOf('\u2211'))
426      .put("minus", Integer.valueOf('\u2212'))
427      .put("lowast", Integer.valueOf('\u2217'))
428      .put("radic", Integer.valueOf('\u221a'))
429      .put("prop", Integer.valueOf('\u221d'))
430      .put("infin", Integer.valueOf('\u221e'))
431      .put("ang", Integer.valueOf('\u2220'))
432      .put("and", Integer.valueOf('\u2227'))
433      .put("or", Integer.valueOf('\u2228'))
434      .put("cap", Integer.valueOf('\u2229'))
435      .put("cup", Integer.valueOf('\u222a'))
436      .put("int", Integer.valueOf('\u222b'))
437      .put("there4", Integer.valueOf('\u2234'))
438      .put("sim", Integer.valueOf('\u223c'))
439      .put("cong", Integer.valueOf('\u2245'))
440      .put("asymp", Integer.valueOf('\u2248'))
441      .put("ne", Integer.valueOf('\u2260'))
442      .put("equiv", Integer.valueOf('\u2261'))
443      .put("le", Integer.valueOf('\u2264'))
444      .put("ge", Integer.valueOf('\u2265'))
445      .put("sub", Integer.valueOf('\u2282'))
446      .put("sup", Integer.valueOf('\u2283'))
447      .put("nsub", Integer.valueOf('\u2284'))
448      .put("sube", Integer.valueOf('\u2286'))
449      .put("supe", Integer.valueOf('\u2287'))
450      .put("oplus", Integer.valueOf('\u2295'))
451      .put("otimes", Integer.valueOf('\u2297'))
452      .put("perp", Integer.valueOf('\u22a5'))
453      .put("sdot", Integer.valueOf('\u22c5'))
454
455    // Miscellaneous Technical
456      .put("lceil", Integer.valueOf('\u2308'))
457      .put("rceil", Integer.valueOf('\u2309'))
458      .put("lfloor", Integer.valueOf('\u230a'))
459      .put("rfloor", Integer.valueOf('\u230b'))
460      .put("lang", Integer.valueOf('\u2329'))
461      .put("rang", Integer.valueOf('\u232a'))
462
463    // Geometric Shapes
464      .put("loz", Integer.valueOf('\u25ca'))
465
466    // Miscellaneous Symbols
467      .put("spades", Integer.valueOf('\u2660'))
468      .put("clubs", Integer.valueOf('\u2663'))
469      .put("hearts", Integer.valueOf('\u2665'))
470      .put("diams", Integer.valueOf('\u2666'))
471
472    // Latin Extended-A
473      .put("OElig", Integer.valueOf('\u0152'))
474      .put("oelig", Integer.valueOf('\u0153'))
475      .put("Scaron", Integer.valueOf('\u0160'))
476      .put("scaron", Integer.valueOf('\u0161'))
477      .put("Yuml", Integer.valueOf('\u0178'))
478
479    // Spacing Modifier Letters
480      .put("circ", Integer.valueOf('\u02c6'))
481      .put("tilde", Integer.valueOf('\u02dc'))
482
483    // General Punctuation
484      .put("ensp", Integer.valueOf('\u2002'))
485      .put("emsp", Integer.valueOf('\u2003'))
486      .put("thinsp", Integer.valueOf('\u2009'))
487      .put("zwnj", Integer.valueOf('\u200c'))
488      .put("zwj", Integer.valueOf('\u200d'))
489      .put("lrm", Integer.valueOf('\u200e'))
490      .put("rlm", Integer.valueOf('\u200f'))
491      .put("ndash", Integer.valueOf('\u2013'))
492      .put("mdash", Integer.valueOf('\u2014'))
493      .put("lsquo", Integer.valueOf('\u2018'))
494      .put("rsquo", Integer.valueOf('\u2019'))
495      .put("sbquo", Integer.valueOf('\u201a'))
496      .put("ldquo", Integer.valueOf('\u201c'))
497      .put("rdquo", Integer.valueOf('\u201d'))
498      .put("bdquo", Integer.valueOf('\u201e'))
499      .put("dagger", Integer.valueOf('\u2020'))
500      .put("Dagger", Integer.valueOf('\u2021'))
501      .put("permil", Integer.valueOf('\u2030'))
502      .put("lsaquo", Integer.valueOf('\u2039'))
503      .put("rsaquo", Integer.valueOf('\u203a'))
504      .put("euro", Integer.valueOf('\u20ac'))
505      .build());
506
507  private HtmlEntities() { /* uninstantiable */ }
508}
509