// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.LinkedList; import java.util.List; import javax.annotation.Nullable; import com.google.common.collect.Lists; /** * Consumes an HTML stream, and dispatches events to a policy object which * decides which elements and attributes to allow. */ public final class HtmlSanitizer { /** * Receives events based on the HTML stream, and applies a policy to decide * what HTML constructs to allow. 
* Typically, implementations use an {@link HtmlStreamRenderer} to produce * the sanitized output. * *
* Implementations of this interface are in the TCB.
*/ @TCB public interface Policy extends HtmlStreamEventReceiver { /** * Called when an HTML tag like {@code* This method is not in the TCB. * *
* This method has no return value since policies are assumed to render things
* they accept and do nothing on things they reject.
* Use {@link HtmlStreamRenderer} to render content to an output buffer.
*
* @param html A snippet of HTML to sanitize. {@code null} is treated as the
* empty string and will not result in a {@code NullPointerException}.
* @param policy The Policy that will receive events based on the tokens in
* {@code html}. Typically, this policy ends up routing the events to an
* {@link HtmlStreamRenderer} after filtering.
* {@link HtmlPolicyBuilder} provides an easy way to create policies.
*/
public static void sanitize(@Nullable String html, final Policy policy) {
if (html == null) { html = ""; }
TagBalancingHtmlStreamEventReceiver balancer
= new TagBalancingHtmlStreamEventReceiver(policy);
// According to Opera the maximum table nesting depth seen in the wild is
// 795, but 99.99% of documents have a table nesting depth of less than 22.
// Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
// document depth of 90 (incl. HTML & BODY).
// Obviously table nesting depth is not the same as whole document depth,
// but it is the best proxy I have available.
// See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
// the original data.
// WebKit defines the maximum HTML parser tree depth as 512.
// http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
// static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
// The first number gives us a lower bound on the nesting depth we allow,
// 90, and the second gives us an upper bound: 512.
// We do not want to bump right up against that limit.
// 256 is substantially larger than the lower bound and well clear of the
// upper bound.
balancer.setNestingLimit(256);
balancer.openDocument();
HtmlLexer lexer = new HtmlLexer(html);
// Use a linked list so that policies can call Iterator.remove() in O(1)
// time.
LinkedList