1// Copyright (c) 2011, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import junit.framework.TestCase;
32
33import javax.annotation.Nullable;
34
35import org.junit.Test;
36
37
38public class HtmlSanitizerTest extends TestCase {
39
40  @Test
41  public static final void testEmpty() throws Exception {
42    assertEquals("", sanitize(""));
43    assertEquals("", sanitize(null));
44  }
45
46  @Test
47  public static final void testSimpleText() throws Exception {
48    assertEquals("hello world", sanitize("hello world"));
49  }
50
51  @Test
52  public static final void testEntities1() throws Exception {
53    assertEquals("<hello world>", sanitize("<hello world>"));
54  }
55
56  @Test
57  public static final void testEntities2() throws Exception {
58    assertEquals("<b>hello <i>world</i></b>",
59                 sanitize("<b>hello <i>world</i></b>"));
60  }
61
62  @Test
63  public static final void testUnknownTagsRemoved() throws Exception {
64    assertEquals("<b>hello <i>world</i></b>",
65                 sanitize("<b>hello <bogus></bogus><i>world</i></b>"));
66  }
67
68  @Test
69  public static final void testUnsafeTagsRemoved() throws Exception {
70    assertEquals("<b>hello <i>world</i></b>",
71                 sanitize("<b>hello <i>world</i>"
72                          + "<script src=foo.js></script></b>"));
73  }
74
75  @Test
76  public static final void testUnsafeAttributesRemoved() throws Exception {
77    assertEquals(
78        "<b>hello <i>world</i></b>",
79        sanitize("<b>hello <i onclick=\"takeOverWorld(this)\">world</i></b>"));
80  }
81
82  @Test
83  public static final void testCruftEscaped() throws Exception {
84    assertEquals("<b>hello <i>world&lt;</i></b> &amp; tomorrow the universe",
85                 sanitize(
86                     "<b>hello <i>world<</i></b> & tomorrow the universe"));
87  }
88
89  @Test
90  public static final void testTagCruftRemoved() throws Exception {
91    assertEquals("<b id=\"p-foo\">hello <i>world&lt;</i></b>",
92                 sanitize("<b id=\"foo\" / -->hello <i>world<</i></b>"));
93  }
94
95  @Test
96  public static final void testIdsAndClassesPrefixed() throws Exception {
97    assertEquals(
98        "<b id=\"p-foo\" class=\"p-boo p-bar p-baz\">"
99        + "hello <i>world&lt;</i></b>",
100        sanitize(
101            "<b id=\"foo\" class=\"boo bar baz\">hello <i>world<</i></b>"));
102  }
103
104  @Test
105  public static final void testSpecialCharsInAttributes() throws Exception {
106    assertEquals(
107        "<b title=\"a&lt;b &amp;&amp; c&gt;b\">bar</b>",
108        sanitize("<b title=\"a<b && c>b\">bar</b>"));
109  }
110
111  @Test
112  public static final void testUnclosedTags() throws Exception {
113    assertEquals("<div id=\"p-foo\">Bar<br />Baz</div>",
114                 sanitize("<div id=\"foo\">Bar<br>Baz"));
115  }
116
117  @Test
118  public static final void testUnopenedTags() throws Exception {
119    assertEquals("Foo<b>Bar</b>Baz",
120                 sanitize("Foo<b></select>Bar</b></b>Baz</select>"));
121  }
122
123  @Test
124  public static final void testUnsafeEndTags() throws Exception {
125    assertEquals(
126        "",
127        sanitize(
128            "</meta http-equiv=\"refesh\""
129            + " content=\"1;URL=http://evilgadget.com\">"));
130  }
131
132  @Test
133  public static final void testEmptyEndTags() throws Exception {
134    assertEquals("<input />", sanitize("<input></input>"));
135  }
136
137  @Test
138  public static final void testOnLoadStripped() throws Exception {
139    assertEquals(
140        "<img />",
141        sanitize("<img src=http://foo.com/bar ONLOAD=alert(1)>"));
142  }
143
144  @Test
145  public static final void testClosingTagParameters() throws Exception {
146    assertEquals(
147        "<p>Hello world</p>",
148        sanitize("<p>Hello world</b style=\"width:expression(alert(1))\">"));
149  }
150
151  @Test
152  public static final void testOptionalEndTags() throws Exception {
153    // Should not be
154    //     "<ol> <li>A</li> <li>B<li>C </li></li></ol>"
155    // The difference is significant because in the first, the item contains no
156    // space after 'A", but in the third, the item contains 'C' and a space.
157    assertEquals(
158        "<ol><li>A</li><li>B</li><li>C </li></ol>",
159        sanitize("<ol> <li>A</li> <li>B<li>C </ol>"));
160  }
161
162  @Test
163  public static final void testFoldingOfHtmlAndBodyTags() throws Exception {
164    assertEquals(
165        "<p>P 1</p>",
166        sanitize("<html><head><title>Foo</title></head>"
167                 + "<body><p>P 1</p></body></html>"));
168    assertEquals(
169        "Hello",
170        sanitize("<body bgcolor=\"blue\">Hello</body>"));
171    assertEquals(
172        "<p>Foo</p><p>One</p><p>Two</p>Three<p>Four</p>",
173        sanitize(
174            "<html>"
175            + "<head>"
176            + "<title>Blah</title>"
177            + "<p>Foo</p>"
178            + "</head>"
179            + "<body>"
180            + "<p>One"
181            + "<p>Two</p>"
182            + "Three"
183            + "<p>Four</p>"
184            + "</body>"
185            + "</html>"));
186  }
187
188  @Test
189  public static final void testEmptyAndValuelessAttributes() throws Exception {
190    assertEquals(
191        "<input checked=\"checked\" type=\"checkbox\" id=\"\" class=\"\" />",
192        sanitize("<input checked type=checkbox id=\"\" class=>"));
193  }
194
195  @Test
196  public static final void testSgmlShortTags() throws Exception {
197    // We make no attempt to correctly handle SGML short tags since they are
198    // not implemented consistently across browsers, and have been removed from
199    // HTML 5.
200    //
201    // According to http://www.w3.org/QA/2007/10/shorttags.html
202    //      Shorttags - the odd side of HTML 4.01
203    //      ...
204    //      It uses an ill-known feature of SGML called shorthand markup, which
205    //      was authorized in HTML up to HTML 4.01. But what used to be a "cool"
206    //      feature for SGML experts becomes a liability in HTML, where the
207    //      construct is more likely to appear as a typo than as a conscious
208    //      choice.
209    //
210    //      All could be fine if this form typo-that-happens-to-be-legal was
211    //      properly implemented in contemporary HTML user-agents. It is not.
212    assertEquals("<p></p>", sanitize("<p/b/"));  // Short-tag discarded.
213    assertEquals("<p></p>", sanitize("<p<b>"));  // Discard <b attribute
214    assertEquals(
215        // This behavior for short tags is not ideal, but it is safe.
216        "<p href=\"/\">first part of the text&lt;/&gt; second part</p>",
217        sanitize("<p<a href=\"/\">first part of the text</> second part"));
218  }
219
220  @Test
221  public static final void testNul() throws Exception {
222    assertEquals(
223        "<a title="
224        + "\"harmless  SCRIPT&#61;javascript:alert(1) ignored&#61;ignored\">"
225        + "</a>",
226        sanitize(
227            "<A TITLE="
228            + "\"harmless\0  SCRIPT=javascript:alert(1) ignored=ignored\">"
229            ));
230  }
231
232  @Test
233  public static final void testDigitsInAttrNames() throws Exception {
234    // See bug 614 for details.
235    assertEquals(
236        "<div>Hello</div>",
237        sanitize(
238            "<div style1=\"expression(\'alert(1)\")\">Hello</div>"
239            ));
240  }
241
242  @Test
243  public static final void testSupplementaryCodepointEncoding()
244      throws Exception {
245    // &#xd87e;&#xdc1a; is not appropriate.
246    // &#x2f81a; is appropriate as is the unencoded form.
247    assertEquals(
248        "&#x2f81a; | &#x2f81a; | &#x2f81a;",
249        sanitize("&#x2F81A; | \ud87e\udc1a | &#xd87e;&#xdc1a;"));
250  }
251
252  @Test
253  public static final void testDeeplyNestedTagsDoS() throws Exception {
254    String sanitized = sanitize(stringRepeatedTimes("<div>", 20000));
255    int n = sanitized.length() / "<div></div>".length();
256    assertTrue("" + n, 50 <= n && n <= 1000);
257    int middle = n * "<div>".length();
258    assertEquals(sanitized.substring(0, middle),
259                 stringRepeatedTimes("<div>", n));
260    assertEquals(sanitized.substring(middle),
261                 stringRepeatedTimes("</div>", n));
262  }
263
264  @Test
265  public static final void testInnerHTMLIE8() throws Exception {
266    // Apparently, in quirks mode, IE8 does a poor job producing innerHTML
267    // values.  Given
268    //     <div attr="``foo=bar">
269    // we encode &#96; but if JavaScript does:
270    //    nodeA.innerHTML = nodeB.innerHTML;
271    // and nodeB contains the DIV above, then IE8 will produce
272    //     <div attr=``foo=bar>
273    // as the value of nodeB.innerHTML and assign it to nodeA.
274    // IE8's HTML parser treats `` as a blank attribute value and foo=bar
275    // becomes a separate attribute.
276    // Adding a space at the end of the attribute prevents this by forcing
277    // IE8 to put double quotes around the attribute when computing
278    // nodeB.innerHTML.
279    assertEquals(
280        "<div title=\"&#96;&#96;onmouseover&#61;alert(1337) \"></div>",
281        sanitize("<div title=\"``onmouseover=alert(1337)\">"));
282  }
283
284  @Test
285  public static final void testNabobsOfNegativism() throws Exception {
286    // Treating <noscript> as raw-text gains us nothing security-wise.
287    assertEquals("<noscript></noscript>",
288                 sanitize("<noscript><evil></noscript>"));
289    assertEquals("<noscript>I <b>&lt;3</b> Ponies</noscript>",
290                 sanitize("<noscript>I <b><3</b> Ponies</noscript>"));
291    assertEquals("<noscript>I <b>&lt;3</b> Ponies</noscript>",
292                 sanitize("<NOSCRIPT>I <b><3</b> Ponies</noscript><evil>"));
293    assertEquals("<noframes>I <b>&lt;3</b> Ponies</noframes>",
294                 sanitize("<noframes>I <b><3</b> Ponies</noframes><evil>"));
295    assertEquals("<noembed>I <b>&lt;3</b> Ponies</noembed>",
296                 sanitize("<noembed>I <b><3</b> Ponies</noembed><evil>"));
297    assertEquals("<noxss>I <b>&lt;3</b> Ponies</noxss>",
298                 sanitize("<noxss>I <b><3</b> Ponies</noxss><evil>"));
299    assertEquals(
300        "&lt;noscript&gt;I &lt;b&gt;&lt;3&lt;/b&gt; Ponies&lt;/noscript&gt;",
301        sanitize("<xmp><noscript>I <b><3</b> Ponies</noscript></xmp>"));
302  }
303
304  @Test
305  public static final void testNULs() throws Exception {
306    assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000</b>"));
307    assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000"));
308    assertEquals("",               sanitize("\u0000"));
309    assertEquals("<b>Hello, </b>", sanitize("<b>Hello, &#0;</b>"));
310    assertEquals("",               sanitize("&#0;"));
311  }
312
313  @Test
314  public static final void testQMarkMeta() throws Exception {
315    assertEquals(
316        "Hello, <b>World</b>!",
317        sanitize(
318            ""
319            // An XML Prologue.
320            // HTML5 treats it as ignorable content via the bogus comment state.
321            + "<?xml version=\"1\" ?>"
322            + "Hello, "
323            // An XML Processing instruction.
324            // HTML5 treats it as ignorable content via the bogus comment state.
325            + "<?processing instruction?>"
326            + "<b>World"
327            // Appears in HTML copied from outlook.
328            + "<?xml:namespace prefix = o ns = "
329            + "\"urn:schemas-microsoft-com:office:office\" />"
330            + "</b>!"));
331  }
332
333  @Test
334  public static final void testScriptInIframe() throws Exception {
335    assertEquals(
336        "<iframe></iframe>",
337        sanitize(
338            "<iframe>\n"
339            + "  <script>alert(Hi)</script>\n"
340            + "</iframe>"));
341  }
342
343  private static String sanitize(@Nullable String html) throws Exception {
344    StringBuilder sb = new StringBuilder();
345    HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
346        sb,
347        new Handler<String>() {
348          public void handle(String errorMessage) {
349            fail(errorMessage);
350          }
351        });
352
353    HtmlSanitizer.Policy policy = new HtmlPolicyBuilder()
354        // Allow these tags.
355       .allowElements(
356           "a", "b", "br", "div", "i", "iframe", "img", "input", "li",
357           "ol", "p", "span", "ul", "noscript", "noframes", "noembed", "noxss")
358       // And these attributes.
359       .allowAttributes(
360           "dir", "checked", "class", "href", "id", "target", "title", "type")
361       .globally()
362       // Cleanup IDs and CLASSes and prefix them with p- to move to a separate
363       // name-space.
364       .allowAttributes("id", "class")
365       .matching(
366           new AttributePolicy() {
367            public String apply(
368                String elementName, String attributeName, String value) {
369              return value.replaceAll("(?:^|\\s)([a-zA-Z])", " p-$1")
370                  .replaceAll("\\s+", " ")
371                  .trim();
372            }
373           })
374       .globally()
375       // Don't throw out useless <img> and <input> elements to ease debugging.
376       .allowWithoutAttributes("img", "input")
377       .build(renderer);
378
379    HtmlSanitizer.sanitize(html, policy);
380
381    return sb.toString();
382  }
383
384  private static final String stringRepeatedTimes(String s, int n) {
385    StringBuilder sb = new StringBuilder(s.length() * n);
386    while (--n >= 0) {
387      sb.append(s);
388    }
389    return sb.toString();
390  }
391}
392