HtmlLexerTest.java revision 8403881c365ab36b721ccc4500af1b3a5bd25870
1// Copyright (c) 2011, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import junit.framework.TestCase;
32
33import java.util.Arrays;
34import java.util.List;
35
36import com.google.common.base.Charsets;
37import com.google.common.collect.Lists;
38import com.google.common.io.Resources;
39
40/**
41 *
42 * @author mikesamuel@gmail.com
43 */
44public class HtmlLexerTest extends TestCase {
45
46  public final void testHtmlLexer() throws Exception {
47    // Do the lexing.
48    String input = Resources.toString(
49        Resources.getResource(getClass(), "htmllexerinput1.html"),
50        Charsets.UTF_8);
51    StringBuilder actual = new StringBuilder();
52    lex(input, actual);
53
54    // Get the golden.
55    String golden = Resources.toString(
56        Resources.getResource(getClass(), "htmllexergolden1.txt"),
57        Charsets.UTF_8);
58
59    // Compare.
60    assertEquals(golden, actual.toString());
61  }
62
63  public final void testEofInTag() throws Exception {
64    assertTokens("<div", "TAGBEGIN: <div");
65    assertTokens("</div", "TAGBEGIN: </div");
66    assertTokens("<div\n", "TAGBEGIN: <div");
67    assertTokens("</div\n", "TAGBEGIN: </div");
68    assertTokens("<div", "TAGBEGIN: <div");
69    assertTokens("</div", "TAGBEGIN: </div");
70    assertTokens("<div\n", "TAGBEGIN: <div");
71    assertTokens("</div\n", "TAGBEGIN: </div");
72  }
73
74  public final void testPartialTagInCData() throws Exception {
75    assertTokens(
76        "<script>w('</b')</script>",
77        "TAGBEGIN: <script",
78        "TAGEND: >",
79        "UNESCAPED: w('</b')",
80        "TAGBEGIN: </script",
81        "TAGEND: >");
82  }
83
84  public final void testUrlEndingInSlashOutsideQuotes() throws Exception {
85    assertTokens(
86        "<a href=http://foo.com/>Clicky</a>",
87        "TAGBEGIN: <a",
88        "ATTRNAME: href",
89        "ATTRVALUE: http://foo.com/",
90        "TAGEND: >",
91        "TEXT: Clicky",
92        "TAGBEGIN: </a",
93        "TAGEND: >");
94  }
95
96  public final void testShortTags() throws Exception {
97    // See comments in html-sanitizer-test.js as to why we don't bother with
98    // short tags.  In short, they are not in HTML5 and not implemented properly
99    // in existing HTML4 clients.
100    assertTokens(
101        "<p<a href=\"/\">first part of the text</> second part",
102        "TAGBEGIN: <p",
103        "ATTRNAME: <a",
104        "ATTRNAME: href",
105        "ATTRVALUE: \"/\"",
106        "TAGEND: >",
107        "TEXT: first part of the text</> second part");
108    assertTokens(
109        "<p/b/",
110        "TAGBEGIN: <p",
111        "ATTRNAME: /",
112        "ATTRNAME: b/");
113    assertTokens(
114        "<p<b>",
115        "TAGBEGIN: <p",
116        "ATTRNAME: <b",
117        "TAGEND: >");
118  }
119
120  private void lex(String input, Appendable out) throws Exception {
121    HtmlLexer lexer = new HtmlLexer(input);
122    int maxTypeLength = 0;
123    for (HtmlTokenType t : HtmlTokenType.values()) {
124      maxTypeLength = Math.max(maxTypeLength, t.name().length());
125    }
126
127    while (lexer.hasNext()) {
128      HtmlToken t = lexer.next();
129      // Do C style escaping of the token text so that each token in the golden
130      // file can fit on one line.
131      String escaped = input.substring(t.start, t.end)
132          .replace("\\", "\\\\").replace("\n", "\\n");
133      String type = t.type.toString();
134      int nPadding = maxTypeLength - type.length();
135      out.append(type);
136      while (--nPadding >= 0) { out.append(' '); }
137      out.append(" [").append(escaped).append("]  :  ")
138          .append(String.valueOf(t.start)).append('-')
139          .append(String.valueOf(t.end))
140          .append("\n");
141    }
142  }
143
144  private void assertTokens(String markup, String... golden) {
145    HtmlLexer lexer = new HtmlLexer(markup);
146    List<String> actual = Lists.newArrayList();
147    while (lexer.hasNext()) {
148      HtmlToken t = lexer.next();
149      actual.add(t.type + ": " + markup.substring(t.start, t.end));
150    }
151    assertEquals(Arrays.asList(golden), actual);
152  }
153}
154