// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.LinkedList; import java.util.List; import javax.annotation.Nullable; import com.google.common.collect.Lists; /** * Consumes an HTML stream, and dispatches events to a policy object which * decides which elements and attributes to allow. */ public final class HtmlSanitizer { /** * Receives events based on the HTML stream, and applies a policy to decide * what HTML constructs to allow. 
* Typically, implementations use an {@link HtmlStreamRenderer} to produce * the sanitized output. * *

* <p>Implementations of this class are in the TCB.

*/
  @TCB
  public interface Policy extends HtmlStreamEventReceiver {
    /**
     * Called when an HTML open tag like {@code <foo bar=baz>} is seen in the
     * input.
     *
     * @param elementName a normalized (lower-case for non-namespaced names)
     *     element name.
     * @param attrs a list of alternating attribute name and value pairs.
     *     For efficiency, this list may be mutated by the implementation
     *     during this method call, but ownership reverts to the caller on
     *     method exit.
     *     The values are raw -- HTML entities have been decoded.
     *     Specifically, implementations are allowed to use a list iterator
     *     and remove all disallowed attributes, add necessary attributes, and
     *     then pass the list to an {@link HtmlStreamRenderer}.
     */
    void openTag(String elementName, List<String> attrs);

    /**
     * Called when an HTML close tag like {@code </foo>} is seen in the input.
     *
     * @param elementName a normalized (lower-case for non-namespaced names)
     *     element name.
     */
    void closeTag(String elementName);

    /**
     * Called when textual content is seen.
     *
     * @param textChunk raw content -- HTML entities have been decoded.
     */
    void text(String textChunk);
  }

  /**
   * Sanitizes the given HTML by applying the given policy to it.
   *
   *

* <p>This method is not in the TCB.
*
* <p>

* This method has no return value since policies are assumed to render things * they accept and do nothing on things they reject. * Use {@link HtmlStreamRenderer} to render content to an output buffer. * * @param html A snippet of HTML to sanitize. {@code null} is treated as the * empty string and will not result in a {@code NullPointerException}. * @param policy The Policy that will receive events based on the tokens in * HTML. Typically, this policy ends up routing the events to an * {@link HtmlStreamRenderer} after filtering. * {@link HtmlPolicyBuilder} provides an easy way to create policies. */ public static void sanitize(@Nullable String html, final Policy policy) { if (html == null) { html = ""; } TagBalancingHtmlStreamEventReceiver balancer = new TagBalancingHtmlStreamEventReceiver(policy); // According to Opera the maximum table nesting depth seen in the wild is // 795, but 99.99% of documents have a table nesting depth of less than 22. // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a // document depth of 90 (incl. HTML & BODY). // Obviously table nesting depth is not the same as whole document depth, // but it is the best proxy I have available. // See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for // the original data. // Webkit defines the maximum HTML parser tree depth as 512. // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408 // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512; // The first number gives us a lower bound on the nesting depth we allow, // 90, and the second gives us an upper bound: 512. // We do not want to bump right up against that limit. // 256 is substantially larger than the lower bound and well clear of the // upper bound. balancer.setNestingLimit(256); balancer.openDocument(); HtmlLexer lexer = new HtmlLexer(html); // Use a linked list so that policies can use Iterator.remove() in an O(1) // way. 
LinkedList attrs = Lists.newLinkedList(); while (lexer.hasNext()) { HtmlToken token = lexer.next(); switch (token.type) { case TEXT: balancer.text( Encoding.decodeHtml(html.substring(token.start, token.end))); break; case UNESCAPED: balancer.text(Encoding.stripBannedCodeunits( html.substring(token.start, token.end))); break; case TAGBEGIN: if (html.charAt(token.start + 1) == '/') { // A close tag. balancer.closeTag(HtmlLexer.canonicalName( html.substring(token.start + 2, token.end))); while (lexer.hasNext() && lexer.next().type != HtmlTokenType.TAGEND) { // skip tokens until we see a ">" } } else { attrs.clear(); boolean attrsReadyForName = true; tagBody: while (lexer.hasNext()) { HtmlToken tagBodyToken = lexer.next(); switch (tagBodyToken.type) { case ATTRNAME: if (!attrsReadyForName) { // Last attribute added was valueless. attrs.add(attrs.getLast()); } else { attrsReadyForName = false; } attrs.add(HtmlLexer.canonicalName( html.substring(tagBodyToken.start, tagBodyToken.end))); break; case ATTRVALUE: attrs.add(Encoding.decodeHtml(stripQuotes( html.substring(tagBodyToken.start, tagBodyToken.end)))); attrsReadyForName = true; break; case TAGEND: break tagBody; default: // Just drop anything not recognized } } if (!attrsReadyForName) { attrs.add(attrs.getLast()); } balancer.openTag( HtmlLexer.canonicalName( html.substring(token.start + 1, token.end)), attrs); } break; default: // Ignore comments, XML prologues, processing instructions, and other // stuff that shouldn't show up in the output. break; } } balancer.closeDocument(); } private static String stripQuotes(String encodedAttributeValue) { int n = encodedAttributeValue.length(); if (n > 0) { char last = encodedAttributeValue.charAt(n - 1); if (last == '"' || last == '\'') { int start = 0; if (n != 1 && last == encodedAttributeValue.charAt(0)) { start = 1; } else { // Browsers deal with missing left quotes : // but generally do not deal with missing right :