/*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.clearsilver.jsilver.functions.html;
import com.google.clearsilver.jsilver.functions.TextFilter;
import com.google.clearsilver.jsilver.functions.escape.HtmlEscapeFunction;
import com.google.clearsilver.jsilver.functions.escape.SimpleEscapingFunction;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This class implements the ClearSilver text_html function.
*
* It converts plain text into html, including adding 'tt' tags to ascii art and linking email and
* web addresses.
*
* Note this implementation differs from ClearSilver, in that it html escapes the contents of links
* and mailtos.
*/
public class TextHtmlFunction implements TextFilter {
// These regular expressions are adapted from html.c in the ClearSilver
// source.
// Regular expression used to match email addresses, taken from the
// ClearSilver source to maintain compatibility.
private static final String EMAIL_REGEXP =
"[^]\\[@:;<>\\\"()\\s\\p{Cntrl}]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]";
// Regular expression used to match urls without a scheme (www.foo.com),
// adapted from the ClearSilver source to maintain compatibility.
private static final String WITH_SCHEME_REGEXP = "(?:http|https|ftp|mailto):[^\\s>\"]*";
// Regular expression used to match urls with a scheme (http://www.foo.com),
// adapted from the ClearSilver source to maintain compatibility.
private static final String WITHOUT_SCHEME_REGEXP = "www\\.[-a-z0-9\\.]+[^\\s;\">]*";
// Pattern to match any string in the input that is linkable.
private static final Pattern LINKABLES =
Pattern.compile("(" + EMAIL_REGEXP + ")|(" + WITH_SCHEME_REGEXP + ")|("
+ WITHOUT_SCHEME_REGEXP + ")", Pattern.CASE_INSENSITIVE);
// Matching groups for the LINKABLES pattern.
private static final int EMAIL_GROUP = 1;
private static final int WITH_SCHEME_GROUP = 2;
// We don't have access to the global html escaper here, so create a new one.
private final HtmlEscapeFunction htmlEscaper = new HtmlEscapeFunction(false);
// Escapes a small set of non-safe html characters, and does a a very small
// amount of formatting.
private final SimpleEscapingFunction htmlCharEscaper =
new SimpleEscapingFunction(new char[] {'<', '>', '&', '\n', '\r'}) {
@Override
protected String getEscapeString(char c) {
switch (c) {
case '<':
return "<";
case '>':
return ">";
case '&':
return "&";
case '\n':
return "
\n";
case '\r':
return "";
default:
return null;
}
}
};
@Override
public void filter(String in, Appendable out) throws IOException {
boolean hasAsciiArt = hasAsciiArt(in);
// Add 'tt' tag to a string that contains 'ascii-art'.
if (hasAsciiArt) {
out.append("");
}
splitAndConvert(in, out);
if (hasAsciiArt) {
out.append("");
}
}
/**
* Splits the input string into blocks of normal text or linkable text. The linkable text is
* converted into anchor tags before being appended to the output. The normal text is escaped and
* appended to the output.
*/
private void splitAndConvert(String in, Appendable out) throws IOException {
Matcher matcher = LINKABLES.matcher(in);
int end = in.length();
int matchStart;
int matchEnd;
int regionStart = 0;
// Keep looking for email addresses and web links until there are none left.
while (matcher.find()) {
matchStart = matcher.start();
matchEnd = matcher.end();
// Escape all the text from the end of the previous match to the start of
// this match, and append it to the output.
htmlCharEscaper.filter(in.subSequence(regionStart, matchStart).toString(), out);
// Don't include a . or , in the text that is linked.
if (in.charAt(matchEnd - 1) == ',' || in.charAt(matchEnd - 1) == '.') {
matchEnd--;
}
if (matcher.group(EMAIL_GROUP) != null) {
formatEmail(in, matchStart, matchEnd, out);
} else {
formatUrl(in, matchStart, matchEnd,
// Add a scheme if the one wasn't found.
matcher.group(WITH_SCHEME_GROUP) == null, out);
}
regionStart = matchEnd;
}
// Escape the text after the last match, and append it to the output.
htmlCharEscaper.filter(in.substring(regionStart, end), out);
}
/**
* Formats the input sequence into a suitable mailto: anchor tag and appends it to the output.
*
* @param in The string that contains the email.
* @param start The start of the email address in the whole string.
* @param end The end of the email in the whole string.
* @param out The text output that the email address should be appended to.
* @throws IOException
*/
private void formatEmail(String in, int start, int end, Appendable out) throws IOException {
String emailPart = in.substring(start, end);
out.append("");
htmlEscaper.filter(emailPart, out);
out.append("");
}
/**
* Formats the input sequence into a suitable anchor tag and appends it to the output.
*
* @param in The string that contains the url.
* @param start The start of the url in the containing string.
* @param end The end of the url in the containing string.
* @param addScheme true if 'http://' should be added to the anchor.
* @param out The text output that the url should be appended to.
* @throws IOException
*/
private void formatUrl(String in, int start, int end, boolean addScheme, Appendable out)
throws IOException {
String urlPart = in.substring(start, end);
out.append(" ");
htmlEscaper.filter(urlPart, out);
out.append("");
}
/**
* Attempts to detect if a string contains ascii art, whitespace such as tabs will suppress ascii
* art detection.
*
* This method takes its conditions from ClearSilver to maintain compatibility. See
* has_space_formatting in html.c in the ClearSilver source.
*
* @param in The string to analyze for ascii art.
* @return true if it is believed that the string contains ascii art.
*/
private boolean hasAsciiArt(String in) {
int spaces = 0;
int returns = 0;
int asciiArt = 0;
int x = 0;
char[] inChars = in.toCharArray();
int length = in.length();
for (x = 0; x < length; x++) {
switch (inChars[x]) {
case '\t':
return false;
case '\r':
break;
case ' ':
// Ignore spaces after full stops.
if (x == 0 || inChars[x - 1] != '.') {
spaces++;
}
break;
case '\n':
spaces = 0;
returns++;
break;
// Characters to count towards the art total.
case '/':
case '\\':
case '<':
case '>':
case ':':
case '[':
case ']':
case '!':
case '@':
case '#':
case '$':
case '%':
case '^':
case '&':
case '*':
case '(':
case ')':
case '|':
asciiArt++;
if (asciiArt > 3) {
return true;
}
break;
default:
if (returns > 2) {
return false;
}
if (spaces > 2) {
return false;
}
returns = 0;
spaces = 0;
asciiArt = 0;
break;
}
}
return false;
}
}