1// Copyright (c) 2011, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import java.util.LinkedList;
32import java.util.List;
33import javax.annotation.Nullable;
34
35import com.google.common.collect.Lists;
36
37/**
38 * Consumes an HTML stream, and dispatches events to a policy object which
39 * decides which elements and attributes to allow.
40 */
41public final class HtmlSanitizer {
42
43  /**
44   * Receives events based on the HTML stream, and applies a policy to decide
45   * what HTML constructs to allow.
46   * Typically, implementations use an {@link HtmlStreamRenderer} to produce
47   * the sanitized output.
48   *
49   * <p>
50   * <b>Implementations of this class are in the TCB.</b></p>
51   */
52  @TCB
53  public interface Policy extends HtmlStreamEventReceiver {
54    /**
55     * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
56     *
57     * @param elementName a normalized (lower-case for non-namespaced names)
58     *     element name.
59     * @param attrs a list of alternating attribute name and value pairs.
60     *     For efficiency, this list may be mutated by this during this method
61     *     call, but ownership reverts to the caller on method exit.
62     *     The values are raw -- HTML entities have been decoded.
63     *     Specifically, implementations are allowed to use a list iterator
64     *     and remove all disallowed attributes, add necessary attributes, and
65     *     then pass the list to an {@link HtmlStreamRenderer}.
66     */
67    void openTag(String elementName, List<String> attrs);
68
69    /**
70     * Called when an HTML tag like {@code </foo>} is seen in the input.
71     *
72     * @param elementName a normalized (lower-case for non-namespaced names)
73     *     element name.
74     */
75    void closeTag(String elementName);
76
77    /**
78     * Called when textual content is seen.
79     * @param textChunk raw content -- HTML entities have been decoded.
80     */
81    void text(String textChunk);
82  }
83
84  /**
85   * Sanitizes the given HTML by applying the given policy to it.
86   *
87   * <p>
88   * This method is not in the TCB.
89   *
90   * <p>
91   * This method has no return value since policies are assumed to render things
92   * they accept and do nothing on things they reject.
93   * Use {@link HtmlStreamRenderer} to render content to an output buffer.
94   *
95   * @param html A snippet of HTML to sanitize.  {@code null} is treated as the
96   *     empty string and will not result in a {@code NullPointerException}.
97   * @param policy The Policy that will receive events based on the tokens in
98   *     HTML.  Typically, this policy ends up routing the events to an
99   *     {@link HtmlStreamRenderer} after filtering.
100   *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
101   */
102  public static void sanitize(@Nullable String html, final Policy policy) {
103    if (html == null) { html = ""; }
104
105    TagBalancingHtmlStreamEventReceiver balancer
106        = new TagBalancingHtmlStreamEventReceiver(policy);
107
108    // According to Opera the maximum table nesting depth seen in the wild is
109    // 795, but 99.99% of documents have a table nesting depth of less than 22.
110    // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
111    // document depth of 90 (incl. HTML & BODY).
112    // Obviously table nesting depth is not the same as whole document depth,
113    // but it is the best proxy I have available.
114    // See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
115    // the original data.
116
117    // Webkit defines the maximum HTML parser tree depth as 512.
118    // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
119    // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
120
121    // The first number gives us a lower bound on the nesting depth we allow,
122    // 90, and the second gives us an upper bound: 512.
123    // We do not want to bump right up against that limit.
124    // 256 is substantially larger than the lower bound and well clear of the
125    // upper bound.
126    balancer.setNestingLimit(256);
127
128    balancer.openDocument();
129
130    HtmlLexer lexer = new HtmlLexer(html);
131    // Use a linked list so that policies can use Iterator.remove() in an O(1)
132    // way.
133    LinkedList<String> attrs = Lists.newLinkedList();
134    while (lexer.hasNext()) {
135      HtmlToken token = lexer.next();
136      switch (token.type) {
137        case TEXT:
138          balancer.text(
139              Encoding.decodeHtml(html.substring(token.start, token.end)));
140          break;
141        case UNESCAPED:
142          balancer.text(Encoding.stripBannedCodeunits(
143              html.substring(token.start, token.end)));
144          break;
145        case TAGBEGIN:
146          if (html.charAt(token.start + 1) == '/') {  // A close tag.
147            balancer.closeTag(HtmlLexer.canonicalName(
148                html.substring(token.start + 2, token.end)));
149            while (lexer.hasNext()
150                   && lexer.next().type != HtmlTokenType.TAGEND) {
151              // skip tokens until we see a ">"
152            }
153          } else {
154            attrs.clear();
155
156            boolean attrsReadyForName = true;
157            tagBody:
158            while (lexer.hasNext()) {
159              HtmlToken tagBodyToken = lexer.next();
160              switch (tagBodyToken.type) {
161                case ATTRNAME:
162                  if (!attrsReadyForName) {
163                    // Last attribute added was valueless.
164                    attrs.add(attrs.getLast());
165                  } else {
166                    attrsReadyForName = false;
167                  }
168                  attrs.add(HtmlLexer.canonicalName(
169                      html.substring(tagBodyToken.start, tagBodyToken.end)));
170                  break;
171                case ATTRVALUE:
172                  attrs.add(Encoding.decodeHtml(stripQuotes(
173                      html.substring(tagBodyToken.start, tagBodyToken.end))));
174                  attrsReadyForName = true;
175                  break;
176                case TAGEND:
177                  break tagBody;
178                default:
179                  // Just drop anything not recognized
180              }
181            }
182            if (!attrsReadyForName) {
183              attrs.add(attrs.getLast());
184            }
185            balancer.openTag(
186                HtmlLexer.canonicalName(
187                    html.substring(token.start + 1, token.end)),
188                attrs);
189          }
190          break;
191        default:
192          // Ignore comments, XML prologues, processing instructions, and other
193          // stuff that shouldn't show up in the output.
194          break;
195      }
196    }
197
198    balancer.closeDocument();
199  }
200
201  private static String stripQuotes(String encodedAttributeValue) {
202    int n = encodedAttributeValue.length();
203    if (n > 0) {
204      char last = encodedAttributeValue.charAt(n - 1);
205      if (last == '"' || last == '\'') {
206        int start = 0;
207        if (n != 1 && last == encodedAttributeValue.charAt(0)) {
208          start = 1;
209        } else {
210          // Browsers deal with missing left quotes : <img src=foo.png">
211          // but generally do not deal with missing right : <img src="foo.png>
212        }
213        return encodedAttributeValue.substring(start, n - 1);
214      }
215    }
216    return encodedAttributeValue;
217  }
218
219}
220