HtmlLexerTest.java revision 8403881c365ab36b721ccc4500af1b3a5bd25870
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
package org.owasp.html;

import junit.framework.TestCase;

import java.util.Arrays;
import java.util.List;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;

/**
 * Tests for {@link HtmlLexer}.
 *
 * <p>One test lexes a whole HTML resource and compares a textual dump of the
 * token stream against a golden file; the remaining tests lex short markup
 * snippets and compare the resulting {@code "TYPE: text"} strings against
 * inline expectations.
 *
 * @author mikesamuel@gmail.com
 */
public class HtmlLexerTest extends TestCase {

  /**
   * Lexes {@code htmllexerinput1.html} and checks that the dump produced by
   * {@link #lex} matches {@code htmllexergolden1.txt} exactly.
   */
  public final void testHtmlLexer() throws Exception {
    // Do the lexing.
    String input = Resources.toString(
        Resources.getResource(getClass(), "htmllexerinput1.html"),
        Charsets.UTF_8);
    StringBuilder actual = new StringBuilder();
    lex(input, actual);

    // Get the golden.
    String golden = Resources.toString(
        Resources.getResource(getClass(), "htmllexergolden1.txt"),
        Charsets.UTF_8);

    // Compare.
    assertEquals(golden, actual.toString());
  }

  /**
   * Input that ends inside an open or close tag, with or without a trailing
   * newline, yields a single TAGBEGIN token that excludes the newline.
   */
  public final void testEofInTag() throws Exception {
    // Each case is asserted once; the original listed the same four
    // assertions twice, which added no coverage.
    assertTokens("<div", "TAGBEGIN: <div");
    assertTokens("</div", "TAGBEGIN: </div");
    assertTokens("<div\n", "TAGBEGIN: <div");
    assertTokens("</div\n", "TAGBEGIN: </div");
  }

  /**
   * A partial close tag inside a {@code <script>} body is kept as part of
   * the UNESCAPED content instead of being lexed as a tag.
   */
  public final void testPartialTagInCData() throws Exception {
    assertTokens(
        "<script>w('</b')</script>",
        "TAGBEGIN: <script",
        "TAGEND: >",
        "UNESCAPED: w('</b')",
        "TAGBEGIN: </script",
        "TAGEND: >");
  }

  /**
   * An unquoted attribute value that ends in {@code /} directly before
   * {@code >} keeps the slash as part of the value.
   */
  public final void testUrlEndingInSlashOutsideQuotes() throws Exception {
    assertTokens(
        "<a href=http://foo.com/>Clicky</a>",
        "TAGBEGIN: <a",
        "ATTRNAME: href",
        "ATTRVALUE: http://foo.com/",
        "TAGEND: >",
        "TEXT: Clicky",
        "TAGBEGIN: </a",
        "TAGEND: >");
  }

  /**
   * SGML-style short tags are not given any special treatment; the pieces
   * that follow the tag name are lexed as attribute names.
   */
  public final void testShortTags() throws Exception {
    // See comments in html-sanitizer-test.js as to why we don't bother with
    // short tags.  In short, they are not in HTML5 and not implemented
    // properly in existing HTML4 clients.
    assertTokens(
        "<p<a href=\"/\">first part of the text</> second part",
        "TAGBEGIN: <p",
        "ATTRNAME: <a",
        "ATTRNAME: href",
        "ATTRVALUE: \"/\"",
        "TAGEND: >",
        "TEXT: first part of the text</> second part");
    assertTokens(
        "<p/b/",
        "TAGBEGIN: <p",
        "ATTRNAME: /",
        "ATTRNAME: b/");
    assertTokens(
        "<p<b>",
        "TAGBEGIN: <p",
        "ATTRNAME: <b",
        "TAGEND: >");
  }

  /**
   * Lexes {@code input} and appends one line per token to {@code out}.
   *
   * <p>Each line has the form {@code TYPE [text] : start-end}, where
   * {@code TYPE} is right-padded with spaces to the length of the longest
   * {@link HtmlTokenType} name and {@code text} is the token's slice of the
   * input with backslashes and newlines escaped.
   *
   * @param input the markup to lex.
   * @param out receives the formatted token lines.
   */
  private void lex(String input, Appendable out) throws Exception {
    HtmlLexer lexer = new HtmlLexer(input);

    // Width of the type column: the longest token type name.
    int maxTypeLength = 0;
    for (HtmlTokenType t : HtmlTokenType.values()) {
      maxTypeLength = Math.max(maxTypeLength, t.name().length());
    }

    while (lexer.hasNext()) {
      HtmlToken t = lexer.next();
      // Do C style escaping of the token text so that each token in the golden
      // file can fit on one line.
      String escaped = input.substring(t.start, t.end)
          .replace("\\", "\\\\").replace("\n", "\\n");
      String type = t.type.toString();
      int nPadding = maxTypeLength - type.length();
      out.append(type);
      while (--nPadding >= 0) { out.append(' '); }
      out.append(" [").append(escaped).append("] : ")
          .append(String.valueOf(t.start)).append('-')
          .append(String.valueOf(t.end))
          .append("\n");
    }
  }

  /**
   * Lexes {@code markup} and asserts that the token stream, rendered as
   * {@code "TYPE: text"} strings, equals {@code golden} in order.
   *
   * @param markup the markup to lex.
   * @param golden the expected tokens, one {@code "TYPE: text"} string each.
   */
  private void assertTokens(String markup, String... golden) {
    HtmlLexer lexer = new HtmlLexer(markup);
    List<String> actual = Lists.newArrayList();
    while (lexer.hasNext()) {
      HtmlToken t = lexer.next();
      actual.add(t.type + ": " + markup.substring(t.start, t.end));
    }
    assertEquals(Arrays.asList(golden), actual);
  }
}