HtmlSanitizer.java revision 8403881c365ab36b721ccc4500af1b3a5bd25870
18403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// Copyright (c) 2011, Mike Samuel
28403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// All rights reserved.
38403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel//
48403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// Redistribution and use in source and binary forms, with or without
58403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// modification, are permitted provided that the following conditions
68403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// are met:
78403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel//
88403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// Redistributions of source code must retain the above copyright
98403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// notice, this list of conditions and the following disclaimer.
108403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// Redistributions in binary form must reproduce the above copyright
118403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// notice, this list of conditions and the following disclaimer in the
128403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// documentation and/or other materials provided with the distribution.
138403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// Neither the name of the OWASP nor the names of its contributors may
148403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// be used to endorse or promote products derived from this software
158403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// without specific prior written permission.
168403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
178403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
188403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
198403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
208403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
218403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
228403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
238403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
248403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
258403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
268403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
278403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// POSSIBILITY OF SUCH DAMAGE.
288403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel
295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.compackage org.owasp.html;
305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.LinkedList;
325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.List;
335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport com.google.common.annotations.VisibleForTesting;
355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport com.google.common.collect.Lists;
365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com/**
385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Consumes an HTML stream, and dispatches events to a policy object which
395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * decides which elements and attributes to allow.
405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */
415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.compublic final class HtmlSanitizer {
425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Receives events based on the HTML stream, and applies a policy to decide
455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * what HTML constructs to allow.
465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Typically, implementations use an {@link HtmlStreamRenderer} to produce
475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * the sanitized output.
485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * <p>
505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * <b>Implementations of this class are in the TCB.</b>
515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  @TCB
535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  interface Policy extends HtmlStreamEventReceiver {
545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    /**
555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *
575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * @param elementName a normalized (lower-case for non-namespaced names)
585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     element name.
595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * @param attrs a list of alternating attribute name and value pairs.
605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     For efficiency, this list may be mutated by this during this method
615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     call, but ownership reverts to the caller on method exit.
625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     The values are raw -- HTML entities have been decoded.
635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     Specifically, implementations are allowed to use a list iterator
645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     and remove all disallowed attributes, add necessary attributes, and
655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     then pass the list to an {@link HtmlStreamRenderer}.
665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     */
675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    void openTag(String elementName, List<String> attrs);
685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    /**
705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * Called when an HTML tag like {@code </foo>} is seen in the input.
715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *
725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * @param elementName a normalized (lower-case for non-namespaced names)
735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     element name.
745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     */
755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    void closeTag(String elementName);
765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    /**
785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * Called when textual content is seen.
795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * @param textChunk raw content -- HTML entities have been decoded.
805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     */
815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    void text(String textChunk);
825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Sanitizes the given HTML by applying the given policy to it.
865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * <p>
885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * This method is not in the TCB.
895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * <p>
915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * This method has no return value since policies are assumed to render things
925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * they accept and do nothing on things they reject.
935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Use {@link HtmlStreamRenderer} to render content to an output buffer.
945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * @param html The html to sanitize.
965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * @param policy The policy that should receive events based on the .
975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
984e867904c8295537803c1c8a076e130df5674b58mikesamuel  public static void sanitize(String html, final Policy policy) {
995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    HtmlStreamEventReceiver balancer = new TagBalancingHtmlStreamEventReceiver(
1005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        policy);
1015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    balancer.openDocument();
1035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    HtmlLexer lexer = new HtmlLexer(html);
1055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // Use a linked list so that policies can use Iterator.remove() in an O(1)
1065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // way.
1075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    LinkedList<String> attrs = Lists.newLinkedList();
1085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    while (lexer.hasNext()) {
1095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      HtmlToken token = lexer.next();
1105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      switch (token.type) {
1115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        case TEXT:
1125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          balancer.text(decodeHtml(html.substring(token.start, token.end)));
1135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
1145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        case UNESCAPED:
1155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          balancer.text(html.substring(token.start, token.end));
1165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
1175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        case TAGBEGIN:
1185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          if (html.charAt(token.start + 1) == '/') {  // A close tag.
1195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            balancer.closeTag(HtmlLexer.canonicalName(
1205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                html.substring(token.start + 2, token.end)));
1215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            while (lexer.hasNext()
1225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                   && lexer.next().type != HtmlTokenType.TAGEND) {
1235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              // skip tokens until we see a ">"
1245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          } else {
1265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            attrs.clear();
1275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            boolean attrsReadyForName = true;
1295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            tagBody:
1305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            while (lexer.hasNext()) {
1315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              HtmlToken tagBodyToken = lexer.next();
1325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              switch (tagBodyToken.type) {
1335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case ATTRNAME:
1345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if (!attrsReadyForName) {
1355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // Last attribute added was valueless.
1365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    attrs.add(attrs.getLast());
1375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else {
1385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    attrsReadyForName = false;
1395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
1405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  attrs.add(HtmlLexer.canonicalName(
1415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      html.substring(tagBodyToken.start, tagBodyToken.end)));
1425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
1435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case ATTRVALUE:
1445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  attrs.add(decodeHtml(stripQuotes(
1455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      html.substring(tagBodyToken.start, tagBodyToken.end))));
1465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  attrsReadyForName = true;
1475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
1485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case TAGEND:
1495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break tagBody;
1505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                default:
1515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  // Just drop anything not recognized
1525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
1535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            if (!attrsReadyForName) {
1555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              attrs.add(attrs.getLast());
1565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            balancer.openTag(
1585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                HtmlLexer.canonicalName(
1595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    html.substring(token.start + 1, token.end)),
1605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                attrs);
1615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          }
1625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
1635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        default:
1645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // Ignore comments, directives, and other stuff that shouldn't show
1655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // up in the output.
1665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
1675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      }
1685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
1695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    balancer.closeDocument();
1715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
1725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1734e867904c8295537803c1c8a076e130df5674b58mikesamuel  private static String stripQuotes(String encodedAttributeValue) {
1745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int n = encodedAttributeValue.length();
1755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (n > 0) {
1765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      char last = encodedAttributeValue.charAt(n - 1);
1775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      if (last == '"' || last == '\'') {
1785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        int start = 0;
1795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (n != 1 && last == encodedAttributeValue.charAt(0)) {
1805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          start = 1;
1815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        } else {
1825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // Browsers deal with missing left quotes : <img src=foo.png">
1835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // but generally do not deal with missing right : <img src="foo.png>
1845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
1855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        return encodedAttributeValue.substring(start, n - 1);
1865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      }
1875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
1885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return encodedAttributeValue;
1895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
1905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  @VisibleForTesting
1925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  static String decodeHtml(String s) {
1935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int amp = s.indexOf('&');
1945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (amp < 0) { return s; }
1955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int pos = 0;
1965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int n = s.length();
1975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    StringBuilder sb = new StringBuilder(n);
1985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int end;
1995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    do {
2005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n);
2015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      end = (int) (endAndCodepoint >>> 32);
2025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      int codepoint = (int) endAndCodepoint;
2035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      sb.append(s, pos, amp).appendCodePoint(codepoint);
2045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      pos = end;
2055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    } while ((amp = s.indexOf('&', end)) >= 0);
2065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return sb.append(s, pos, n).toString();
2075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com}
210