1// Copyright (c) 2011, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import junit.framework.TestCase;
32
33import java.util.Arrays;
34import java.util.List;
35
36import org.junit.Test;
37
38import com.google.common.base.Charsets;
39import com.google.common.collect.Lists;
40import com.google.common.io.Resources;
41
42public class HtmlLexerTest extends TestCase {
43
44  @Test
45  public final void testHtmlLexer() throws Exception {
46    // Do the lexing.
47    String input = Resources.toString(
48        Resources.getResource(getClass(), "htmllexerinput1.html"),
49        Charsets.UTF_8);
50    StringBuilder actual = new StringBuilder();
51    lex(input, actual);
52
53    // Get the golden.
54    String golden = Resources.toString(
55        Resources.getResource(getClass(), "htmllexergolden1.txt"),
56        Charsets.UTF_8);
57
58    // Compare.
59    assertEquals(golden, actual.toString());
60  }
61
62  @Test
63  public static final void testEofInTag() throws Exception {
64    assertTokens("<div", "TAGBEGIN: <div");
65    assertTokens("</div", "TAGBEGIN: </div");
66    assertTokens("<div\n", "TAGBEGIN: <div");
67    assertTokens("</div\n", "TAGBEGIN: </div");
68    assertTokens("<div", "TAGBEGIN: <div");
69    assertTokens("</div", "TAGBEGIN: </div");
70    assertTokens("<div\n", "TAGBEGIN: <div");
71    assertTokens("</div\n", "TAGBEGIN: </div");
72  }
73
74  @Test
75  public static final void testPartialTagInCData() throws Exception {
76    assertTokens(
77        "<script>w('</b')</script>",
78        "TAGBEGIN: <script",
79        "TAGEND: >",
80        "UNESCAPED: w('</b')",
81        "TAGBEGIN: </script",
82        "TAGEND: >");
83  }
84
85  @Test
86  public static final void testUrlEndingInSlashOutsideQuotes()
87      throws Exception {
88    assertTokens(
89        "<a href=http://foo.com/>Clicky</a>",
90        "TAGBEGIN: <a",
91        "ATTRNAME: href",
92        "ATTRVALUE: http://foo.com/",
93        "TAGEND: >",
94        "TEXT: Clicky",
95        "TAGBEGIN: </a",
96        "TAGEND: >");
97  }
98
99  @Test
100  public static final void testShortTags() throws Exception {
101    // See comments in html-sanitizer-test.js as to why we don't bother with
102    // short tags.  In short, they are not in HTML5 and not implemented properly
103    // in existing HTML4 clients.
104    assertTokens(
105        "<p<a href=\"/\">first part of the text</> second part",
106        "TAGBEGIN: <p",
107        "ATTRNAME: <a",
108        "ATTRNAME: href",
109        "ATTRVALUE: \"/\"",
110        "TAGEND: >",
111        "TEXT: first part of the text</> second part");
112    assertTokens(
113        "<p/b/",
114        "TAGBEGIN: <p",
115        "ATTRNAME: /",
116        "ATTRNAME: b/");
117    assertTokens(
118        "<p<b>",
119        "TAGBEGIN: <p",
120        "ATTRNAME: <b",
121        "TAGEND: >");
122  }
123
124  private static void lex(String input, Appendable out) throws Exception {
125    HtmlLexer lexer = new HtmlLexer(input);
126    int maxTypeLength = 0;
127    for (HtmlTokenType t : HtmlTokenType.values()) {
128      maxTypeLength = Math.max(maxTypeLength, t.name().length());
129    }
130
131    while (lexer.hasNext()) {
132      HtmlToken t = lexer.next();
133      // Do C style escaping of the token text so that each token in the golden
134      // file can fit on one line.
135      String escaped = input.substring(t.start, t.end)
136          .replace("\\", "\\\\").replace("\n", "\\n");
137      String type = t.type.toString();
138      int nPadding = maxTypeLength - type.length();
139      out.append(type);
140      while (--nPadding >= 0) { out.append(' '); }
141      out.append(" [").append(escaped).append("]  :  ")
142          .append(String.valueOf(t.start)).append('-')
143          .append(String.valueOf(t.end))
144          .append("\n");
145    }
146  }
147
148  private static void assertTokens(String markup, String... golden) {
149    HtmlLexer lexer = new HtmlLexer(markup);
150    List<String> actual = Lists.newArrayList();
151    while (lexer.hasNext()) {
152      HtmlToken t = lexer.next();
153      actual.add(t.type + ": " + markup.substring(t.start, t.end));
154    }
155    assertEquals(Arrays.asList(golden), actual);
156  }
157}
158