TextHtmlFunction.java revision 56ed4167b942ec265f9cee70ac4d71d10b3835ce
/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17package com.google.clearsilver.jsilver.functions.html;
18
19import com.google.clearsilver.jsilver.functions.TextFilter;
20import com.google.clearsilver.jsilver.functions.escape.HtmlEscapeFunction;
21import com.google.clearsilver.jsilver.functions.escape.SimpleEscapingFunction;
22
23import java.io.IOException;
24import java.util.regex.Matcher;
25import java.util.regex.Pattern;
26
27/**
28 * This class implements the ClearSilver text_html function.
29 *
30 * It converts plain text into html, including adding 'tt' tags to ascii art and linking email and
31 * web addresses.
32 *
33 * Note this implementation differs from ClearSilver, in that it html escapes the contents of links
34 * and mailtos.
35 */
36public class TextHtmlFunction implements TextFilter {
37
38  // These regular expressions are adapted from html.c in the ClearSilver
39  // source.
40
41  // Regular expression used to match email addresses, taken from the
42  // ClearSilver source to maintain compatibility.
43  private static final String EMAIL_REGEXP =
44      "[^]\\[@:;<>\\\"()\\s\\p{Cntrl}]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]";
45
46  // Regular expression used to match urls without a scheme (www.foo.com),
47  // adapted from the ClearSilver source to maintain compatibility.
48  private static final String WITH_SCHEME_REGEXP = "(?:http|https|ftp|mailto):[^\\s>\"]*";
49
50  // Regular expression used to match urls with a scheme (http://www.foo.com),
51  // adapted from the ClearSilver source to maintain compatibility.
52  private static final String WITHOUT_SCHEME_REGEXP = "www\\.[-a-z0-9\\.]+[^\\s;\">]*";
53
54  // Pattern to match any string in the input that is linkable.
55  private static final Pattern LINKABLES =
56      Pattern.compile("(" + EMAIL_REGEXP + ")|(" + WITH_SCHEME_REGEXP + ")|("
57          + WITHOUT_SCHEME_REGEXP + ")", Pattern.CASE_INSENSITIVE);
58
59  // Matching groups for the LINKABLES pattern.
60  private static final int EMAIL_GROUP = 1;
61  private static final int WITH_SCHEME_GROUP = 2;
62
63  // We don't have access to the global html escaper here, so create a new one.
64  private final HtmlEscapeFunction htmlEscaper = new HtmlEscapeFunction(false);
65
66  // Escapes a small set of non-safe html characters, and does a a very small
67  // amount of formatting.
68  private final SimpleEscapingFunction htmlCharEscaper =
69      new SimpleEscapingFunction(new char[] {'<', '>', '&', '\n', '\r'}) {
70
71        @Override
72        protected String getEscapeString(char c) {
73          switch (c) {
74            case '<':
75              return "&lt;";
76            case '>':
77              return "&gt;";
78            case '&':
79              return "&amp;";
80            case '\n':
81              return "<br/>\n";
82            case '\r':
83              return "";
84            default:
85              return null;
86          }
87        }
88
89      };
90
91  @Override
92  public void filter(String in, Appendable out) throws IOException {
93
94    boolean hasAsciiArt = hasAsciiArt(in);
95
96    // Add 'tt' tag to a string that contains 'ascii-art'.
97    if (hasAsciiArt) {
98      out.append("<tt>");
99    }
100
101    splitAndConvert(in, out);
102
103    if (hasAsciiArt) {
104      out.append("</tt>");
105    }
106  }
107
108  /**
109   * Splits the input string into blocks of normal text or linkable text. The linkable text is
110   * converted into anchor tags before being appended to the output. The normal text is escaped and
111   * appended to the output.
112   */
113  private void splitAndConvert(String in, Appendable out) throws IOException {
114    Matcher matcher = LINKABLES.matcher(in);
115    int end = in.length();
116    int matchStart;
117    int matchEnd;
118    int regionStart = 0;
119
120    // Keep looking for email addresses and web links until there are none left.
121    while (matcher.find()) {
122      matchStart = matcher.start();
123      matchEnd = matcher.end();
124
125      // Escape all the text from the end of the previous match to the start of
126      // this match, and append it to the output.
127      htmlCharEscaper.filter(in.subSequence(regionStart, matchStart).toString(), out);
128
129      // Don't include a . or , in the text that is linked.
130      if (in.charAt(matchEnd - 1) == ',' || in.charAt(matchEnd - 1) == '.') {
131        matchEnd--;
132      }
133
134      if (matcher.group(EMAIL_GROUP) != null) {
135        formatEmail(in, matchStart, matchEnd, out);
136      } else {
137        formatUrl(in, matchStart, matchEnd,
138        // Add a scheme if the one wasn't found.
139            matcher.group(WITH_SCHEME_GROUP) == null, out);
140      }
141
142      regionStart = matchEnd;
143    }
144
145    // Escape the text after the last match, and append it to the output.
146    htmlCharEscaper.filter(in.substring(regionStart, end), out);
147  }
148
149  /**
150   * Formats the input sequence into a suitable mailto: anchor tag and appends it to the output.
151   *
152   * @param in The string that contains the email.
153   * @param start The start of the email address in the whole string.
154   * @param end The end of the email in the whole string.
155   * @param out The text output that the email address should be appended to.
156   * @throws IOException
157   */
158  private void formatEmail(String in, int start, int end, Appendable out) throws IOException {
159
160    String emailPart = in.substring(start, end);
161
162    out.append("<a href=\"mailto:");
163    htmlEscaper.filter(emailPart, out);
164    out.append("\">");
165    htmlEscaper.filter(emailPart, out);
166    out.append("</a>");
167  }
168
169  /**
170   * Formats the input sequence into a suitable anchor tag and appends it to the output.
171   *
172   * @param in The string that contains the url.
173   * @param start The start of the url in the containing string.
174   * @param end The end of the url in the containing string.
175   * @param addScheme true if 'http://' should be added to the anchor.
176   * @param out The text output that the url should be appended to.
177   * @throws IOException
178   */
179  private void formatUrl(String in, int start, int end, boolean addScheme, Appendable out)
180      throws IOException {
181
182    String urlPart = in.substring(start, end);
183
184    out.append(" <a target=\"_blank\" href=\"");
185    if (addScheme) {
186      out.append("http://");
187    }
188    htmlEscaper.filter(urlPart, out);
189    out.append("\">");
190    htmlEscaper.filter(urlPart, out);
191    out.append("</a>");
192  }
193
194  /**
195   * Attempts to detect if a string contains ascii art, whitespace such as tabs will suppress ascii
196   * art detection.
197   *
198   * This method takes its conditions from ClearSilver to maintain compatibility. See
199   * has_space_formatting in html.c in the ClearSilver source.
200   *
201   * @param in The string to analyze for ascii art.
202   * @return true if it is believed that the string contains ascii art.
203   */
204  private boolean hasAsciiArt(String in) {
205    int spaces = 0;
206    int returns = 0;
207    int asciiArt = 0;
208    int x = 0;
209    char[] inChars = in.toCharArray();
210
211    int length = in.length();
212    for (x = 0; x < length; x++) {
213
214      switch (inChars[x]) {
215        case '\t':
216          return false;
217
218        case '\r':
219          break;
220
221        case ' ':
222          // Ignore spaces after full stops.
223          if (x == 0 || inChars[x - 1] != '.') {
224            spaces++;
225          }
226          break;
227
228        case '\n':
229          spaces = 0;
230          returns++;
231          break;
232
233        // Characters to count towards the art total.
234        case '/':
235        case '\\':
236        case '<':
237        case '>':
238        case ':':
239        case '[':
240        case ']':
241        case '!':
242        case '@':
243        case '#':
244        case '$':
245        case '%':
246        case '^':
247        case '&':
248        case '*':
249        case '(':
250        case ')':
251        case '|':
252          asciiArt++;
253          if (asciiArt > 3) {
254            return true;
255          }
256          break;
257
258        default:
259          if (returns > 2) {
260            return false;
261          }
262          if (spaces > 2) {
263            return false;
264          }
265          returns = 0;
266          spaces = 0;
267          asciiArt = 0;
268          break;
269      }
270    }
271
272    return false;
273  }
274}
275