// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.TreeMap; /** * A trie used to separate punctuation tokens in a run of non-whitespace * characters by preferring the longest punctuation string possible in a * greedy left-to-right scan. * * @author Mike Samuel */ final class Trie { private final char[] childMap; private final Trie[] children; private final boolean terminal; private final int value; /** * @param elements not empty, non null. */ public Trie(Map elements) { this(sortedUniqEntries(elements), 0); } private Trie(List> elements, int depth) { this(elements, depth, 0, elements.size()); } /** * @param elements not empty, non null. Not modified. * @param depth the depth in the tree. * @param start an index into punctuationStrings of the first string in this * subtree. * @param end an index into punctuationStrings past the last string in this * subtree. */ private Trie( List> elements, int depth, int start, int end) { this.terminal = depth == elements.get(start).getKey().length(); if (this.terminal) { this.value = elements.get(start).getValue(); if (start + 1 == end) { // base case this.childMap = ZERO_CHARS; this.children = ZERO_TRIES; return; } else { ++start; } } else { this.value = Integer.MAX_VALUE; } int childCount = 0; { int last = -1; for (int i = start; i < end; ++i) { char ch = elements.get(i).getKey().charAt(depth); if (ch != last) { ++childCount; last = ch; } } } this.childMap = new char[childCount]; this.children = new Trie[childCount]; int childStart = start; int childIndex = 0; char lastCh = elements.get(start).getKey().charAt(depth); for (int i = start + 1; i < end; ++i) { char ch = elements.get(i).getKey().charAt(depth); if (ch != lastCh) { childMap[childIndex] = lastCh; children[childIndex++] = new Trie( elements, depth + 1, childStart, i); childStart = i; lastCh = ch; } } childMap[childIndex] = lastCh; children[childIndex++] = new Trie(elements, depth + 1, childStart, end); } /** Does this node correspond to a complete string in the input set. */ public boolean isTerminal() { return terminal; } public int getValue() { return value; } /** * The child corresponding to the given character. * @return null if no such trie. */ public Trie lookup(char ch) { int i = Arrays.binarySearch(childMap, ch); return i >= 0 ? children[i] : null; } /** * The descendant of this trie corresponding to the string for this trie * appended with s. * @param s non null. * @return null if no such trie. */ public Trie lookup(CharSequence s) { Trie t = this; for (int i = 0, n = s.length(); i < n; ++i) { t = t.lookup(s.charAt(i)); if (null == t) { break; } } return t; } public boolean contains(char ch) { return Arrays.binarySearch(childMap, ch) >= 0; } private static List> sortedUniqEntries( Map m) { return new ArrayList>( new TreeMap(m).entrySet()); } private static final char[] ZERO_CHARS = new char[0]; private static final Trie[] ZERO_TRIES = new Trie[0]; /** * Append all strings s such that {@code this.lookup(s).isTerminal()} to the * given list in lexical order. */ public void toStringList(List strings) { toStringList("", strings); } private void toStringList(String prefix, List strings) { if (terminal) { strings.add(prefix); } for (int i = 0, n = childMap.length; i < n; ++i) { children[i].toStringList(prefix + childMap[i], strings); } } @Override public String toString() { StringBuilder sb = new StringBuilder(); toStringBuilder(0, sb); return sb.toString(); } private void toStringBuilder(int depth, StringBuilder sb) { sb.append(terminal ? "terminal" : "nonterminal"); ++depth; for (int i = 0; i < childMap.length; ++i) { sb.append('\n'); for (int d = 0; d < depth; ++d) { sb.append('\t'); } sb.append('\'').append(childMap[i]).append("' "); children[i].toStringBuilder(depth, sb); } } }