// HtmlLexerTest.java revision be666032a113a8af92bc557add8e83579cf0ef5c
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import junit.framework.TestCase;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;

/**
 * Tests for {@link HtmlLexer}.
 *
 * <p>One test lexes a whole HTML resource and compares a textual dump of the
 * tokens against a golden resource; the others check the token sequence
 * produced for short, hand-written snippets.
 */
public class HtmlLexerTest extends TestCase {

  /**
   * Lexes the {@code htmllexerinput1.html} resource and checks that the
   * formatted token dump equals the {@code htmllexergolden1.txt} resource.
   *
   * @throws Exception if a resource cannot be read or the dump cannot be
   *     written.
   */
  public final void testHtmlLexer() throws Exception {
    // Do the lexing.
    String input = Resources.toString(
        Resources.getResource(getClass(), "htmllexerinput1.html"),
        Charsets.UTF_8);
    StringBuilder actual = new StringBuilder();
    lex(input, actual);

    // Get the golden.
    String golden = Resources.toString(
        Resources.getResource(getClass(), "htmllexergolden1.txt"),
        Charsets.UTF_8);

    // Compare.
    assertEquals(golden, actual.toString());
  }

  /**
   * An open or close tag that is cut off by the end of input is expected to
   * produce a single {@code TAGBEGIN} token, whether or not a newline follows
   * the tag name.
   */
  @Test
  public static final void testEofInTag() throws Exception {
    // Each case was previously asserted twice with byte-identical arguments;
    // the repeats added no coverage and have been dropped.
    assertTokens("<div", "TAGBEGIN: <div");
    assertTokens("</div", "TAGBEGIN: </div");
    assertTokens("<div\n", "TAGBEGIN: <div");
    assertTokens("</div\n", "TAGBEGIN: </div");
  }

  /**
   * Something that looks like the start of a close tag inside a
   * {@code <script>} body is expected to stay part of the {@code UNESCAPED}
   * content token rather than start a new tag.
   */
  @Test
  public static final void testPartialTagInCData() throws Exception {
    assertTokens(
        "<script>w('</b')</script>",
        "TAGBEGIN: <script",
        "TAGEND: >",
        "UNESCAPED: w('</b')",
        "TAGBEGIN: </script",
        "TAGEND: >");
  }

  /**
   * A trailing slash in an unquoted attribute value is expected to belong to
   * the value, not to the end of the tag.
   */
  @Test
  public static final void testUrlEndingInSlashOutsideQuotes()
      throws Exception {
    assertTokens(
        "<a href=http://foo.com/>Clicky</a>",
        "TAGBEGIN: <a",
        "ATTRNAME: href",
        "ATTRVALUE: http://foo.com/",
        "TAGEND: >",
        "TEXT: Clicky",
        "TAGBEGIN: </a",
        "TAGEND: >");
  }

  /**
   * Pins down how SGML-style short tags are tokenized: the pieces that would
   * form a short tag come out as attribute names or plain text.
   */
  @Test
  public static final void testShortTags() throws Exception {
    // See comments in html-sanitizer-test.js as to why we don't bother with
    // short tags.  In short, they are not in HTML5 and not implemented properly
    // in existing HTML4 clients.
    assertTokens(
        "<p<a href=\"/\">first part of the text</> second part",
        "TAGBEGIN: <p",
        "ATTRNAME: <a",
        "ATTRNAME: href",
        "ATTRVALUE: \"/\"",
        "TAGEND: >",
        "TEXT: first part of the text</> second part");
    assertTokens(
        "<p/b/",
        "TAGBEGIN: <p",
        "ATTRNAME: /",
        "ATTRNAME: b/");
    assertTokens(
        "<p<b>",
        "TAGBEGIN: <p",
        "ATTRNAME: <b",
        "TAGEND: >");
  }

  /**
   * Lexes {@code input} and writes one line per token to {@code out}.
   *
   * <p>Each line holds the token type right-padded with spaces to the length
   * of the longest {@link HtmlTokenType} name, then {@code " ["}, the escaped
   * token text, {@code "] : "}, and the token's {@code start-end} offsets.
   *
   * @param input the markup to lex.
   * @param out receives the formatted token dump.
   * @throws Exception if writing to {@code out} fails.
   */
  private static void lex(String input, Appendable out) throws Exception {
    HtmlLexer lexer = new HtmlLexer(input);

    // Width of the type column: the longest token type name.
    int maxTypeLength = 0;
    for (HtmlTokenType t : HtmlTokenType.values()) {
      maxTypeLength = Math.max(maxTypeLength, t.name().length());
    }

    while (lexer.hasNext()) {
      HtmlToken t = lexer.next();
      // Do C style escaping of the token text so that each token in the golden
      // file can fit on one line.
      String escaped = input.substring(t.start, t.end)
          .replace("\\", "\\\\").replace("\n", "\\n");
      String type = t.type.toString();
      int nPadding = maxTypeLength - type.length();
      out.append(type);
      while (--nPadding >= 0) { out.append(' '); }
      out.append(" [").append(escaped).append("] : ")
          .append(String.valueOf(t.start)).append('-')
          .append(String.valueOf(t.end))
          .append("\n");
    }
  }

  /**
   * Lexes {@code markup} and asserts that its tokens, each rendered as
   * {@code "<type>: <token text>"}, equal {@code golden} in order.
   *
   * @param markup the HTML snippet to lex.
   * @param golden the expected rendered tokens, in order.
   */
  private static void assertTokens(String markup, String... golden) {
    HtmlLexer lexer = new HtmlLexer(markup);
    List<String> actual = Lists.newArrayList();
    while (lexer.hasNext()) {
      HtmlToken t = lexer.next();
      actual.add(t.type + ": " + markup.substring(t.start, t.end));
    }
    assertEquals(Arrays.asList(golden), actual);
  }
}