/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.clearsilver.jsilver.functions.html; import com.google.clearsilver.jsilver.functions.TextFilter; import java.io.IOException; import java.util.Collections; import java.util.HashMap; import java.util.Map; /** * This class implements the html_strip function. It removes html tags from text, and expands * numbered and named html entities to their corresponding special characters. */ public class HtmlStripFunction implements TextFilter { // The maximum length of an entity (preceded by an &) private static final int MAX_AMP_LENGTH = 9; // The state the strip function can be, normal, in an amp escaped entity or // inside a html tag. private enum State { DEFAULT, IN_AMP, IN_TAG } // Map of entity names to special characters. private static final Map entityValues; // Initialize the entityName lookup map. static { Map tempMap = new HashMap(); // Html specific characters. tempMap.put("amp", "&"); tempMap.put("quot", "\""); tempMap.put("gt", ">"); tempMap.put("lt", "<"); tempMap.put("agrave", "\u00e0"); tempMap.put("aacute", "\u00e1"); tempMap.put("acirc", "\u00e2"); tempMap.put("atilde", "\u00e3"); tempMap.put("auml", "\u00e4"); tempMap.put("aring", "\u00e5"); tempMap.put("aelig", "\u00e6"); tempMap.put("ccedil", "\u00e7"); tempMap.put("egrave", "\u00e8"); tempMap.put("eacute", "\u00e9"); tempMap.put("ecirc", "\u00ea"); tempMap.put("euml", "\u00eb"); tempMap.put("eth", "\u00f0"); tempMap.put("igrave", "\u00ec"); tempMap.put("iacute", "\u00ed"); tempMap.put("icirc", "\u00ee"); tempMap.put("iuml", "\u00ef"); tempMap.put("ntilde", "\u00f1"); tempMap.put("nbsp", " "); tempMap.put("ograve", "\u00f2"); tempMap.put("oacute", "\u00f3"); tempMap.put("ocirc", "\u00f4"); tempMap.put("otilde", "\u00f5"); tempMap.put("ouml", "\u00f6"); tempMap.put("oslash", "\u00f8"); tempMap.put("szlig", "\u00df"); tempMap.put("thorn", "\u00fe"); tempMap.put("ugrave", "\u00f9"); tempMap.put("uacute", "\u00fa"); tempMap.put("ucirc", "\u00fb"); tempMap.put("uuml", "\u00fc"); tempMap.put("yacute", "\u00fd"); // Clearsilver's Copyright symbol! tempMap.put("copy", "(C)"); // Copy the temporary map to an unmodifiable map for the static lookup. entityValues = Collections.unmodifiableMap(tempMap); } @Override public void filter(String in, Appendable out) throws IOException { char[] inChars = in.toCharArray(); // Holds the contents of an & (amp) entity before its decoded. StringBuilder amp = new StringBuilder(); State state = State.DEFAULT; // Loop over the input string, ignoring tags, and decoding entities. for (int i = 0; i < inChars.length; i++) { char c = inChars[i]; switch (state) { case DEFAULT: switch (c) { case '&': state = State.IN_AMP; break; case '<': state = State.IN_TAG; break; default: // If this is isn't the start of an amp of a tag, treat as plain // text and just output. out.append(c); } break; case IN_TAG: // When in a tag, all input is ignored until the end of the tag. if (c == '>') { state = State.DEFAULT; } break; case IN_AMP: // Semi-colon terminates an entity, try and decode it. if (c == ';') { state = State.DEFAULT; appendDecodedEntityReference(out, amp); amp = new StringBuilder(); } else { if (amp.length() < MAX_AMP_LENGTH) { // If this is not the last character in the input, append to the // amp buffer and continue, if it is the last, dump the buffer // to stop the contents of it being lost. if (i != inChars.length - 1) { amp.append(c); } else { out.append('&').append(amp).append(c); } } else { // More than 8 chars, so not a valid entity, dump as plain text. out.append('&').append(amp).append(c); amp = new StringBuilder(); state = State.DEFAULT; } } break; } } } /** * Attempts to decode the entity provided, if it succeeds appends it to the out string. * * @param out the string builder to add the decoded entity to. * @param entityName to decode. */ private void appendDecodedEntityReference(Appendable out, CharSequence entityName) throws IOException { // All the valid entities are at least two characters long. if (entityName.length() < 2) { return; } entityName = entityName.toString().toLowerCase(); // Numbered entity. if (entityName.charAt(0) == '#') { appendNumberedEntity(out, entityName.subSequence(1, entityName.length())); return; } // If the entity is not a numeric value, try looking it up by name. String entity = entityValues.get(entityName); // If there is an entity by that name add it to the output. if (entity != null) { out.append(entity); } } /** * Appends an entity to a string by numeric code. * * @param out the string to add the entity to. * @param entity the numeric code for the entity as a char sequence. */ private void appendNumberedEntity(Appendable out, CharSequence entity) throws IOException { if (entity.length() != 0) { try { char c; // Hex numbered entities start with x. if (entity.charAt(0) == 'x') { c = (char) Integer.parseInt(entity.subSequence(1, entity.length()).toString(), 16); } else { // If its numbered, but not hex, its decimal. c = (char) Integer.parseInt(entity.toString(), 10); } // Don't append null characters, this is to remain Clearsilver compatible. if (c != '\u0000') { out.append(c); } } catch (NumberFormatException e) { // Do nothing if this is not a valid numbered entity. } } } }