HtmlLexerTest.java revision be666032a113a8af92bc557add8e83579cf0ef5c
1// Copyright (c) 2011, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import junit.framework.TestCase;
32
33import java.util.Arrays;
34import java.util.List;
35
36import org.junit.Test;
37
38import com.google.common.base.Charsets;
39import com.google.common.collect.Lists;
40import com.google.common.io.Resources;
41
42public class HtmlLexerTest extends TestCase {
43
44  public final void testHtmlLexer() throws Exception {
45    // Do the lexing.
46    String input = Resources.toString(
47        Resources.getResource(getClass(), "htmllexerinput1.html"),
48        Charsets.UTF_8);
49    StringBuilder actual = new StringBuilder();
50    lex(input, actual);
51
52    // Get the golden.
53    String golden = Resources.toString(
54        Resources.getResource(getClass(), "htmllexergolden1.txt"),
55        Charsets.UTF_8);
56
57    // Compare.
58    assertEquals(golden, actual.toString());
59  }
60
61  @Test
62  public static final void testEofInTag() throws Exception {
63    assertTokens("<div", "TAGBEGIN: <div");
64    assertTokens("</div", "TAGBEGIN: </div");
65    assertTokens("<div\n", "TAGBEGIN: <div");
66    assertTokens("</div\n", "TAGBEGIN: </div");
67    assertTokens("<div", "TAGBEGIN: <div");
68    assertTokens("</div", "TAGBEGIN: </div");
69    assertTokens("<div\n", "TAGBEGIN: <div");
70    assertTokens("</div\n", "TAGBEGIN: </div");
71  }
72
73  @Test
74  public static final void testPartialTagInCData() throws Exception {
75    assertTokens(
76        "<script>w('</b')</script>",
77        "TAGBEGIN: <script",
78        "TAGEND: >",
79        "UNESCAPED: w('</b')",
80        "TAGBEGIN: </script",
81        "TAGEND: >");
82  }
83
84  @Test
85  public static final void testUrlEndingInSlashOutsideQuotes()
86      throws Exception {
87    assertTokens(
88        "<a href=http://foo.com/>Clicky</a>",
89        "TAGBEGIN: <a",
90        "ATTRNAME: href",
91        "ATTRVALUE: http://foo.com/",
92        "TAGEND: >",
93        "TEXT: Clicky",
94        "TAGBEGIN: </a",
95        "TAGEND: >");
96  }
97
98  @Test
99  public static final void testShortTags() throws Exception {
100    // See comments in html-sanitizer-test.js as to why we don't bother with
101    // short tags.  In short, they are not in HTML5 and not implemented properly
102    // in existing HTML4 clients.
103    assertTokens(
104        "<p<a href=\"/\">first part of the text</> second part",
105        "TAGBEGIN: <p",
106        "ATTRNAME: <a",
107        "ATTRNAME: href",
108        "ATTRVALUE: \"/\"",
109        "TAGEND: >",
110        "TEXT: first part of the text</> second part");
111    assertTokens(
112        "<p/b/",
113        "TAGBEGIN: <p",
114        "ATTRNAME: /",
115        "ATTRNAME: b/");
116    assertTokens(
117        "<p<b>",
118        "TAGBEGIN: <p",
119        "ATTRNAME: <b",
120        "TAGEND: >");
121  }
122
123  private static void lex(String input, Appendable out) throws Exception {
124    HtmlLexer lexer = new HtmlLexer(input);
125    int maxTypeLength = 0;
126    for (HtmlTokenType t : HtmlTokenType.values()) {
127      maxTypeLength = Math.max(maxTypeLength, t.name().length());
128    }
129
130    while (lexer.hasNext()) {
131      HtmlToken t = lexer.next();
132      // Do C style escaping of the token text so that each token in the golden
133      // file can fit on one line.
134      String escaped = input.substring(t.start, t.end)
135          .replace("\\", "\\\\").replace("\n", "\\n");
136      String type = t.type.toString();
137      int nPadding = maxTypeLength - type.length();
138      out.append(type);
139      while (--nPadding >= 0) { out.append(' '); }
140      out.append(" [").append(escaped).append("]  :  ")
141          .append(String.valueOf(t.start)).append('-')
142          .append(String.valueOf(t.end))
143          .append("\n");
144    }
145  }
146
147  private static void assertTokens(String markup, String... golden) {
148    HtmlLexer lexer = new HtmlLexer(markup);
149    List<String> actual = Lists.newArrayList();
150    while (lexer.hasNext()) {
151      HtmlToken t = lexer.next();
152      actual.add(t.type + ": " + markup.substring(t.start, t.end));
153    }
154    assertEquals(Arrays.asList(golden), actual);
155  }
156}
157