// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import junit.framework.TestCase; import java.util.Arrays; import java.util.List; import org.junit.Test; import com.google.common.base.Charsets; import com.google.common.collect.Lists; import com.google.common.io.Resources; public class HtmlLexerTest extends TestCase { @Test public final void testHtmlLexer() throws Exception { // Do the lexing. String input = Resources.toString( Resources.getResource(getClass(), "htmllexerinput1.html"), Charsets.UTF_8); StringBuilder actual = new StringBuilder(); lex(input, actual); // Get the golden. String golden = Resources.toString( Resources.getResource(getClass(), "htmllexergolden1.txt"), Charsets.UTF_8); // Compare. assertEquals(golden, actual.toString()); } @Test public static final void testEofInTag() throws Exception { assertTokens("w('", "TAGBEGIN: ", "UNESCAPED: w('"); } @Test public static final void testUrlEndingInSlashOutsideQuotes() throws Exception { assertTokens( "Clicky", "TAGBEGIN: ", "TEXT: Clicky", "TAGBEGIN: "); } @Test public static final void testShortTags() throws Exception { // See comments in html-sanitizer-test.js as to why we don't bother with // short tags. In short, they are not in HTML5 and not implemented properly // in existing HTML4 clients. assertTokens( "first part of the text second part", "TAGBEGIN: ", "TEXT: first part of the text second part"); assertTokens( "

", "TAGBEGIN: "); } private static void lex(String input, Appendable out) throws Exception { HtmlLexer lexer = new HtmlLexer(input); int maxTypeLength = 0; for (HtmlTokenType t : HtmlTokenType.values()) { maxTypeLength = Math.max(maxTypeLength, t.name().length()); } while (lexer.hasNext()) { HtmlToken t = lexer.next(); // Do C style escaping of the token text so that each token in the golden // file can fit on one line. String escaped = input.substring(t.start, t.end) .replace("\\", "\\\\").replace("\n", "\\n"); String type = t.type.toString(); int nPadding = maxTypeLength - type.length(); out.append(type); while (--nPadding >= 0) { out.append(' '); } out.append(" [").append(escaped).append("] : ") .append(String.valueOf(t.start)).append('-') .append(String.valueOf(t.end)) .append("\n"); } } private static void assertTokens(String markup, String... golden) { HtmlLexer lexer = new HtmlLexer(markup); List actual = Lists.newArrayList(); while (lexer.hasNext()) { HtmlToken t = lexer.next(); actual.add(t.type + ": " + markup.substring(t.start, t.end)); } assertEquals(Arrays.asList(golden), actual); } }