1/*
2 * Copyright (C) 2010 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.emailcommon.utility;
18
19import com.google.common.annotations.VisibleForTesting;
20
21import android.graphics.Color;
22import android.text.Spannable;
23import android.text.SpannableString;
24import android.text.SpannableStringBuilder;
25import android.text.TextUtils;
26import android.text.style.BackgroundColorSpan;
27
28import java.io.IOException;
29import java.util.ArrayList;
30import java.util.HashMap;
31import java.util.Map;
32import java.util.StringTokenizer;
33
34public class TextUtilities {
35    // Highlight color is yellow, as in other apps.
36    // TODO Push for this to be a global (style-related?) constant
37    public static final int HIGHLIGHT_COLOR_INT = Color.YELLOW;
38    // We AND off the "alpha" from the color (i.e. 0xFFFFFF00 -> 0x00FFFF00)
39    /*package*/ static final String HIGHLIGHT_COLOR_STRING =
40        '#' + Integer.toHexString(HIGHLIGHT_COLOR_INT & 0x00FFFFFF);
41
42    // This is how many chars we'll allow in a snippet
43    private static final int MAX_SNIPPET_LENGTH = 200;
44    // For some reason, isWhitespace() returns false with the following...
45    /*package*/ static final char NON_BREAKING_SPACE_CHARACTER = (char)160;
46
47    // Tags whose content must be stripped as well
48    static final String[] STRIP_TAGS =
49        new String[] {"title", "script", "style", "applet", "head"};
50    // The number of characters we peel off for testing against STRIP_TAGS; this should be the
51    // maximum size of the strings in STRIP_TAGS
52    static final int MAX_STRIP_TAG_LENGTH = 6;
53
54    static final Map<String, Character> ESCAPE_STRINGS;
55    static {
56        // HTML character entity references as defined in HTML 4
57        // see http://www.w3.org/TR/REC-html40/sgml/entities.html
58        ESCAPE_STRINGS = new HashMap<String, Character>(252);
59
60        ESCAPE_STRINGS.put("&nbsp", '\u00A0');
61        ESCAPE_STRINGS.put("&iexcl", '\u00A1');
62        ESCAPE_STRINGS.put("&cent", '\u00A2');
63        ESCAPE_STRINGS.put("&pound", '\u00A3');
64        ESCAPE_STRINGS.put("&curren", '\u00A4');
65        ESCAPE_STRINGS.put("&yen", '\u00A5');
66        ESCAPE_STRINGS.put("&brvbar", '\u00A6');
67        ESCAPE_STRINGS.put("&sect", '\u00A7');
68        ESCAPE_STRINGS.put("&uml", '\u00A8');
69        ESCAPE_STRINGS.put("&copy", '\u00A9');
70        ESCAPE_STRINGS.put("&ordf", '\u00AA');
71        ESCAPE_STRINGS.put("&laquo", '\u00AB');
72        ESCAPE_STRINGS.put("&not", '\u00AC');
73        ESCAPE_STRINGS.put("&shy", '\u00AD');
74        ESCAPE_STRINGS.put("&reg", '\u00AE');
75        ESCAPE_STRINGS.put("&macr", '\u00AF');
76        ESCAPE_STRINGS.put("&deg", '\u00B0');
77        ESCAPE_STRINGS.put("&plusmn", '\u00B1');
78        ESCAPE_STRINGS.put("&sup2", '\u00B2');
79        ESCAPE_STRINGS.put("&sup3", '\u00B3');
80        ESCAPE_STRINGS.put("&acute", '\u00B4');
81        ESCAPE_STRINGS.put("&micro", '\u00B5');
82        ESCAPE_STRINGS.put("&para", '\u00B6');
83        ESCAPE_STRINGS.put("&middot", '\u00B7');
84        ESCAPE_STRINGS.put("&cedil", '\u00B8');
85        ESCAPE_STRINGS.put("&sup1", '\u00B9');
86        ESCAPE_STRINGS.put("&ordm", '\u00BA');
87        ESCAPE_STRINGS.put("&raquo", '\u00BB');
88        ESCAPE_STRINGS.put("&frac14", '\u00BC');
89        ESCAPE_STRINGS.put("&frac12", '\u00BD');
90        ESCAPE_STRINGS.put("&frac34", '\u00BE');
91        ESCAPE_STRINGS.put("&iquest", '\u00BF');
92        ESCAPE_STRINGS.put("&Agrave", '\u00C0');
93        ESCAPE_STRINGS.put("&Aacute", '\u00C1');
94        ESCAPE_STRINGS.put("&Acirc", '\u00C2');
95        ESCAPE_STRINGS.put("&Atilde", '\u00C3');
96        ESCAPE_STRINGS.put("&Auml", '\u00C4');
97        ESCAPE_STRINGS.put("&Aring", '\u00C5');
98        ESCAPE_STRINGS.put("&AElig", '\u00C6');
99        ESCAPE_STRINGS.put("&Ccedil", '\u00C7');
100        ESCAPE_STRINGS.put("&Egrave", '\u00C8');
101        ESCAPE_STRINGS.put("&Eacute", '\u00C9');
102        ESCAPE_STRINGS.put("&Ecirc", '\u00CA');
103        ESCAPE_STRINGS.put("&Euml", '\u00CB');
104        ESCAPE_STRINGS.put("&Igrave", '\u00CC');
105        ESCAPE_STRINGS.put("&Iacute", '\u00CD');
106        ESCAPE_STRINGS.put("&Icirc", '\u00CE');
107        ESCAPE_STRINGS.put("&Iuml", '\u00CF');
108        ESCAPE_STRINGS.put("&ETH", '\u00D0');
109        ESCAPE_STRINGS.put("&Ntilde", '\u00D1');
110        ESCAPE_STRINGS.put("&Ograve", '\u00D2');
111        ESCAPE_STRINGS.put("&Oacute", '\u00D3');
112        ESCAPE_STRINGS.put("&Ocirc", '\u00D4');
113        ESCAPE_STRINGS.put("&Otilde", '\u00D5');
114        ESCAPE_STRINGS.put("&Ouml", '\u00D6');
115        ESCAPE_STRINGS.put("&times", '\u00D7');
116        ESCAPE_STRINGS.put("&Oslash", '\u00D8');
117        ESCAPE_STRINGS.put("&Ugrave", '\u00D9');
118        ESCAPE_STRINGS.put("&Uacute", '\u00DA');
119        ESCAPE_STRINGS.put("&Ucirc", '\u00DB');
120        ESCAPE_STRINGS.put("&Uuml", '\u00DC');
121        ESCAPE_STRINGS.put("&Yacute", '\u00DD');
122        ESCAPE_STRINGS.put("&THORN", '\u00DE');
123        ESCAPE_STRINGS.put("&szlig", '\u00DF');
124        ESCAPE_STRINGS.put("&agrave", '\u00E0');
125        ESCAPE_STRINGS.put("&aacute", '\u00E1');
126        ESCAPE_STRINGS.put("&acirc", '\u00E2');
127        ESCAPE_STRINGS.put("&atilde", '\u00E3');
128        ESCAPE_STRINGS.put("&auml", '\u00E4');
129        ESCAPE_STRINGS.put("&aring", '\u00E5');
130        ESCAPE_STRINGS.put("&aelig", '\u00E6');
131        ESCAPE_STRINGS.put("&ccedil", '\u00E7');
132        ESCAPE_STRINGS.put("&egrave", '\u00E8');
133        ESCAPE_STRINGS.put("&eacute", '\u00E9');
134        ESCAPE_STRINGS.put("&ecirc", '\u00EA');
135        ESCAPE_STRINGS.put("&euml", '\u00EB');
136        ESCAPE_STRINGS.put("&igrave", '\u00EC');
137        ESCAPE_STRINGS.put("&iacute", '\u00ED');
138        ESCAPE_STRINGS.put("&icirc", '\u00EE');
139        ESCAPE_STRINGS.put("&iuml", '\u00EF');
140        ESCAPE_STRINGS.put("&eth", '\u00F0');
141        ESCAPE_STRINGS.put("&ntilde", '\u00F1');
142        ESCAPE_STRINGS.put("&ograve", '\u00F2');
143        ESCAPE_STRINGS.put("&oacute", '\u00F3');
144        ESCAPE_STRINGS.put("&ocirc", '\u00F4');
145        ESCAPE_STRINGS.put("&otilde", '\u00F5');
146        ESCAPE_STRINGS.put("&ouml", '\u00F6');
147        ESCAPE_STRINGS.put("&divide", '\u00F7');
148        ESCAPE_STRINGS.put("&oslash", '\u00F8');
149        ESCAPE_STRINGS.put("&ugrave", '\u00F9');
150        ESCAPE_STRINGS.put("&uacute", '\u00FA');
151        ESCAPE_STRINGS.put("&ucirc", '\u00FB');
152        ESCAPE_STRINGS.put("&uuml", '\u00FC');
153        ESCAPE_STRINGS.put("&yacute", '\u00FD');
154        ESCAPE_STRINGS.put("&thorn", '\u00FE');
155        ESCAPE_STRINGS.put("&yuml", '\u00FF');
156        ESCAPE_STRINGS.put("&fnof", '\u0192');
157        ESCAPE_STRINGS.put("&Alpha", '\u0391');
158        ESCAPE_STRINGS.put("&Beta", '\u0392');
159        ESCAPE_STRINGS.put("&Gamma", '\u0393');
160        ESCAPE_STRINGS.put("&Delta", '\u0394');
161        ESCAPE_STRINGS.put("&Epsilon", '\u0395');
162        ESCAPE_STRINGS.put("&Zeta", '\u0396');
163        ESCAPE_STRINGS.put("&Eta", '\u0397');
164        ESCAPE_STRINGS.put("&Theta", '\u0398');
165        ESCAPE_STRINGS.put("&Iota", '\u0399');
166        ESCAPE_STRINGS.put("&Kappa", '\u039A');
167        ESCAPE_STRINGS.put("&Lambda", '\u039B');
168        ESCAPE_STRINGS.put("&Mu", '\u039C');
169        ESCAPE_STRINGS.put("&Nu", '\u039D');
170        ESCAPE_STRINGS.put("&Xi", '\u039E');
171        ESCAPE_STRINGS.put("&Omicron", '\u039F');
172        ESCAPE_STRINGS.put("&Pi", '\u03A0');
173        ESCAPE_STRINGS.put("&Rho", '\u03A1');
174        ESCAPE_STRINGS.put("&Sigma", '\u03A3');
175        ESCAPE_STRINGS.put("&Tau", '\u03A4');
176        ESCAPE_STRINGS.put("&Upsilon", '\u03A5');
177        ESCAPE_STRINGS.put("&Phi", '\u03A6');
178        ESCAPE_STRINGS.put("&Chi", '\u03A7');
179        ESCAPE_STRINGS.put("&Psi", '\u03A8');
180        ESCAPE_STRINGS.put("&Omega", '\u03A9');
181        ESCAPE_STRINGS.put("&alpha", '\u03B1');
182        ESCAPE_STRINGS.put("&beta", '\u03B2');
183        ESCAPE_STRINGS.put("&gamma", '\u03B3');
184        ESCAPE_STRINGS.put("&delta", '\u03B4');
185        ESCAPE_STRINGS.put("&epsilon", '\u03B5');
186        ESCAPE_STRINGS.put("&zeta", '\u03B6');
187        ESCAPE_STRINGS.put("&eta", '\u03B7');
188        ESCAPE_STRINGS.put("&theta", '\u03B8');
189        ESCAPE_STRINGS.put("&iota", '\u03B9');
190        ESCAPE_STRINGS.put("&kappa", '\u03BA');
191        ESCAPE_STRINGS.put("&lambda", '\u03BB');
192        ESCAPE_STRINGS.put("&mu", '\u03BC');
193        ESCAPE_STRINGS.put("&nu", '\u03BD');
194        ESCAPE_STRINGS.put("&xi", '\u03BE');
195        ESCAPE_STRINGS.put("&omicron", '\u03BF');
196        ESCAPE_STRINGS.put("&pi", '\u03C0');
197        ESCAPE_STRINGS.put("&rho", '\u03C1');
198        ESCAPE_STRINGS.put("&sigmaf", '\u03C2');
199        ESCAPE_STRINGS.put("&sigma", '\u03C3');
200        ESCAPE_STRINGS.put("&tau", '\u03C4');
201        ESCAPE_STRINGS.put("&upsilon", '\u03C5');
202        ESCAPE_STRINGS.put("&phi", '\u03C6');
203        ESCAPE_STRINGS.put("&chi", '\u03C7');
204        ESCAPE_STRINGS.put("&psi", '\u03C8');
205        ESCAPE_STRINGS.put("&omega", '\u03C9');
206        ESCAPE_STRINGS.put("&thetasym", '\u03D1');
207        ESCAPE_STRINGS.put("&upsih", '\u03D2');
208        ESCAPE_STRINGS.put("&piv", '\u03D6');
209        ESCAPE_STRINGS.put("&bull", '\u2022');
210        ESCAPE_STRINGS.put("&hellip", '\u2026');
211        ESCAPE_STRINGS.put("&prime", '\u2032');
212        ESCAPE_STRINGS.put("&Prime", '\u2033');
213        ESCAPE_STRINGS.put("&oline", '\u203E');
214        ESCAPE_STRINGS.put("&frasl", '\u2044');
215        ESCAPE_STRINGS.put("&weierp", '\u2118');
216        ESCAPE_STRINGS.put("&image", '\u2111');
217        ESCAPE_STRINGS.put("&real", '\u211C');
218        ESCAPE_STRINGS.put("&trade", '\u2122');
219        ESCAPE_STRINGS.put("&alefsym", '\u2135');
220        ESCAPE_STRINGS.put("&larr", '\u2190');
221        ESCAPE_STRINGS.put("&uarr", '\u2191');
222        ESCAPE_STRINGS.put("&rarr", '\u2192');
223        ESCAPE_STRINGS.put("&darr", '\u2193');
224        ESCAPE_STRINGS.put("&harr", '\u2194');
225        ESCAPE_STRINGS.put("&crarr", '\u21B5');
226        ESCAPE_STRINGS.put("&lArr", '\u21D0');
227        ESCAPE_STRINGS.put("&uArr", '\u21D1');
228        ESCAPE_STRINGS.put("&rArr", '\u21D2');
229        ESCAPE_STRINGS.put("&dArr", '\u21D3');
230        ESCAPE_STRINGS.put("&hArr", '\u21D4');
231        ESCAPE_STRINGS.put("&forall", '\u2200');
232        ESCAPE_STRINGS.put("&part", '\u2202');
233        ESCAPE_STRINGS.put("&exist", '\u2203');
234        ESCAPE_STRINGS.put("&empty", '\u2205');
235        ESCAPE_STRINGS.put("&nabla", '\u2207');
236        ESCAPE_STRINGS.put("&isin", '\u2208');
237        ESCAPE_STRINGS.put("&notin", '\u2209');
238        ESCAPE_STRINGS.put("&ni", '\u220B');
239        ESCAPE_STRINGS.put("&prod", '\u220F');
240        ESCAPE_STRINGS.put("&sum", '\u2211');
241        ESCAPE_STRINGS.put("&minus", '\u2212');
242        ESCAPE_STRINGS.put("&lowast", '\u2217');
243        ESCAPE_STRINGS.put("&radic", '\u221A');
244        ESCAPE_STRINGS.put("&prop", '\u221D');
245        ESCAPE_STRINGS.put("&infin", '\u221E');
246        ESCAPE_STRINGS.put("&ang", '\u2220');
247        ESCAPE_STRINGS.put("&and", '\u2227');
248        ESCAPE_STRINGS.put("&or", '\u2228');
249        ESCAPE_STRINGS.put("&cap", '\u2229');
250        ESCAPE_STRINGS.put("&cup", '\u222A');
251        ESCAPE_STRINGS.put("&int", '\u222B');
252        ESCAPE_STRINGS.put("&there4", '\u2234');
253        ESCAPE_STRINGS.put("&sim", '\u223C');
254        ESCAPE_STRINGS.put("&cong", '\u2245');
255        ESCAPE_STRINGS.put("&asymp", '\u2248');
256        ESCAPE_STRINGS.put("&ne", '\u2260');
257        ESCAPE_STRINGS.put("&equiv", '\u2261');
258        ESCAPE_STRINGS.put("&le", '\u2264');
259        ESCAPE_STRINGS.put("&ge", '\u2265');
260        ESCAPE_STRINGS.put("&sub", '\u2282');
261        ESCAPE_STRINGS.put("&sup", '\u2283');
262        ESCAPE_STRINGS.put("&nsub", '\u2284');
263        ESCAPE_STRINGS.put("&sube", '\u2286');
264        ESCAPE_STRINGS.put("&supe", '\u2287');
265        ESCAPE_STRINGS.put("&oplus", '\u2295');
266        ESCAPE_STRINGS.put("&otimes", '\u2297');
267        ESCAPE_STRINGS.put("&perp", '\u22A5');
268        ESCAPE_STRINGS.put("&sdot", '\u22C5');
269        ESCAPE_STRINGS.put("&lceil", '\u2308');
270        ESCAPE_STRINGS.put("&rceil", '\u2309');
271        ESCAPE_STRINGS.put("&lfloor", '\u230A');
272        ESCAPE_STRINGS.put("&rfloor", '\u230B');
273        ESCAPE_STRINGS.put("&lang", '\u2329');
274        ESCAPE_STRINGS.put("&rang", '\u232A');
275        ESCAPE_STRINGS.put("&loz", '\u25CA');
276        ESCAPE_STRINGS.put("&spades", '\u2660');
277        ESCAPE_STRINGS.put("&clubs", '\u2663');
278        ESCAPE_STRINGS.put("&hearts", '\u2665');
279        ESCAPE_STRINGS.put("&diams", '\u2666');
280        ESCAPE_STRINGS.put("&quot", '\u0022');
281        ESCAPE_STRINGS.put("&amp", '\u0026');
282        ESCAPE_STRINGS.put("&lt", '\u003C');
283        ESCAPE_STRINGS.put("&gt", '\u003E');
284        ESCAPE_STRINGS.put("&OElig", '\u0152');
285        ESCAPE_STRINGS.put("&oelig", '\u0153');
286        ESCAPE_STRINGS.put("&Scaron", '\u0160');
287        ESCAPE_STRINGS.put("&scaron", '\u0161');
288        ESCAPE_STRINGS.put("&Yuml", '\u0178');
289        ESCAPE_STRINGS.put("&circ", '\u02C6');
290        ESCAPE_STRINGS.put("&tilde", '\u02DC');
291        ESCAPE_STRINGS.put("&ensp", '\u2002');
292        ESCAPE_STRINGS.put("&emsp", '\u2003');
293        ESCAPE_STRINGS.put("&thinsp", '\u2009');
294        ESCAPE_STRINGS.put("&zwnj", '\u200C');
295        ESCAPE_STRINGS.put("&zwj", '\u200D');
296        ESCAPE_STRINGS.put("&lrm", '\u200E');
297        ESCAPE_STRINGS.put("&rlm", '\u200F');
298        ESCAPE_STRINGS.put("&ndash", '\u2013');
299        ESCAPE_STRINGS.put("&mdash", '\u2014');
300        ESCAPE_STRINGS.put("&lsquo", '\u2018');
301        ESCAPE_STRINGS.put("&rsquo", '\u2019');
302        ESCAPE_STRINGS.put("&sbquo", '\u201A');
303        ESCAPE_STRINGS.put("&ldquo", '\u201C');
304        ESCAPE_STRINGS.put("&rdquo", '\u201D');
305        ESCAPE_STRINGS.put("&bdquo", '\u201E');
306        ESCAPE_STRINGS.put("&dagger", '\u2020');
307        ESCAPE_STRINGS.put("&Dagger", '\u2021');
308        ESCAPE_STRINGS.put("&permil", '\u2030');
309        ESCAPE_STRINGS.put("&lsaquo", '\u2039');
310        ESCAPE_STRINGS.put("&rsaquo", '\u203A');
311        ESCAPE_STRINGS.put("&euro", '\u20AC');
312    }
313
    /**
     * Code to generate a short 'snippet' from either plain text or html text
     *
     * If the sync protocol can get plain text, that's great, but we'll still strip out extraneous
     * whitespace.  If it's HTML, we'll 1) strip out tags, 2) turn entities into the appropriate
     * characters, and 3) strip out extraneous whitespace, all in one pass
     *
     * Why not use an existing class?  The best answer is performance; yet another answer is
     * correctness (e.g. Html.textFromHtml simply doesn't generate well-stripped text).  But
     * performance is key; we frequently sync text that is 10K or (much) longer, yet we really only
     * care about a small amount of text for the snippet.  So it's critically important that we just
     * stop when we've gotten enough; existing methods will go through the entire incoming string,
     * at great (and useless, in this case) expense.
     */
328
329    public static String makeSnippetFromHtmlText(String text) {
330        return makeSnippetFromText(text, true);
331    }
332
333    public static String makeSnippetFromPlainText(String text) {
334        return makeSnippetFromText(text, false);
335    }
336
337    /**
338     * Find the end of this tag; there are two alternatives: <tag .../> or <tag ...> ... </tag>
339     * @param htmlText some HTML text
340     * @param tag the HTML tag
341     * @param startPos the start position in the HTML text where the tag starts
342     * @return the position just before the end of the tag or -1 if not found
343     */
344    /*package*/ static int findTagEnd(String htmlText, String tag, int startPos) {
345        if (tag.endsWith(" ")) {
346            tag = tag.substring(0, tag.length() - 1);
347        }
348        int length = htmlText.length();
349        char prevChar = 0;
350        for (int i = startPos; i < length; i++) {
351            char c = htmlText.charAt(i);
352            if (c == '>') {
353               if (prevChar == '/') {
354                   return i - 1;
355               }
356               break;
357            }
358            prevChar = c;
359        }
360        // We didn't find /> at the end of the tag so find </tag>
361        return htmlText.indexOf("/" + tag, startPos);
362    }
363
364    public static String makeSnippetFromText(String text, boolean stripHtml) {
365        // Handle null and empty string
366        if (TextUtils.isEmpty(text)) return "";
367
368        final int length = text.length();
369        // Use char[] instead of StringBuilder purely for performance; fewer method calls, etc.
370        char[] buffer = new char[MAX_SNIPPET_LENGTH];
371        // skipCount is an array of a single int; that int is set inside stripHtmlEntity and is
372        // used to determine how many characters can be "skipped" due to the transformation of the
373        // entity to a single character.  When Java allows multiple return values, we can make this
374        // much cleaner :-)
375        int[] skipCount = new int[1];
376        int bufferCount = 0;
377        // Start with space as last character to avoid leading whitespace
378        char last = ' ';
379        // Indicates whether we're in the middle of an HTML tag
380        boolean inTag = false;
381
382        // Walk through the text until we're done with the input OR we've got a large enough snippet
383        for (int i = 0; i < length && bufferCount < MAX_SNIPPET_LENGTH; i++) {
384            char c = text.charAt(i);
385            if (stripHtml && !inTag && (c == '<')) {
386                // Find tags to strip; they will begin with <! or !- or </ or <letter
387                if (i < (length - 1)) {
388                    char peek = text.charAt(i + 1);
389                    if (peek == '!' || peek == '-' || peek == '/' || Character.isLetter(peek)) {
390                        inTag = true;
391                        // Strip content of title, script, style and applet tags
392                        if (i < (length - (MAX_STRIP_TAG_LENGTH + 2))) {
393                            String tag = text.substring(i + 1, i + MAX_STRIP_TAG_LENGTH + 1);
394                            String tagLowerCase = tag.toLowerCase();
395                            boolean stripContent = false;
396                            for (String stripTag: STRIP_TAGS) {
397                                if (tagLowerCase.startsWith(stripTag)) {
398                                    stripContent = true;
399                                    tag = tag.substring(0, stripTag.length());
400                                    break;
401                                }
402                            }
403                            if (stripContent) {
404                                // Look for the end of this tag
405                                int endTagPosition = findTagEnd(text, tag, i);
406                                if (endTagPosition < 0) {
407                                    break;
408                                } else {
409                                    i = endTagPosition;
410                                }
411                            }
412                        }
413                    }
414                }
415            } else if (stripHtml && inTag && (c == '>')) {
416                // Terminate stripping here
417                inTag = false;
418                continue;
419            }
420
421            if (inTag) {
422                // We just skip by everything while we're in a tag
423                continue;
424            } else if (stripHtml && (c == '&')) {
425                // Handle a possible HTML entity here
426                // We always get back a character to use; we also get back a "skip count",
427                // indicating how many characters were eaten from the entity
428                c = stripHtmlEntity(text, i, skipCount);
429                i += skipCount[0];
430            }
431
432            if (Character.isWhitespace(c) || (c == NON_BREAKING_SPACE_CHARACTER)) {
433                // The idea is to find the content in the message, not the whitespace, so we'll
434                // turn any combination of contiguous whitespace into a single space
435                if (last == ' ') {
436                    continue;
437                } else {
438                    // Make every whitespace character a simple space
439                    c = ' ';
440                }
441            } else if ((c == '-' || c == '=') && (last == c)) {
442                // Lots of messages (especially digests) have whole lines of --- or ===
443                // We'll get rid of those duplicates here
444                continue;
445            }
446
447            // After all that, maybe we've got a character for our snippet
448            buffer[bufferCount++] = c;
449            last = c;
450        }
451
452        // Lose trailing space and return our snippet
453        if ((bufferCount > 0) && (last == ' ')) {
454            bufferCount--;
455        }
456        return new String(buffer, 0, bufferCount);
457    }
458
459    static /*package*/ char stripHtmlEntity(String text, int pos, int[] skipCount) {
460        int length = text.length();
461        // Ugly, but we store our skip count in this array; we can't use a static here, because
462        // multiple threads might be calling in
463        skipCount[0] = 0;
464        // All entities are <= 8 characters long, so that's how far we'll look for one (+ & and ;)
465        int end = pos + 10;
466        String entity = null;
467        // Isolate the entity
468        for (int i = pos; (i < length) && (i < end); i++) {
469            if (text.charAt(i) == ';') {
470                entity = text.substring(pos, i);
471                break;
472            }
473        }
474        if (entity == null) {
475            // This wasn't really an HTML entity
476            return '&';
477        } else {
478            // Skip count is the length of the entity
479            Character mapping = ESCAPE_STRINGS.get(entity);
480            int entityLength = entity.length();
481            if (mapping != null) {
482                skipCount[0] = entityLength;
483                return mapping;
484            } else if ((entityLength > 2) && (entity.charAt(1) == '#')) {
485                // &#nn; means ascii nn (decimal) and &#xnn means ascii nn (hex)
486                char c = '?';
487                try {
488                    int i;
489                    if ((entity.charAt(2) == 'x') && (entityLength > 3)) {
490                        i = Integer.parseInt(entity.substring(3), 16);
491                    } else {
492                        i = Integer.parseInt(entity.substring(2));
493                    }
494                    c = (char)i;
495                } catch (NumberFormatException e) {
496                    // We'll just return the ? in this case
497                }
498                skipCount[0] = entityLength;
499                return c;
500            }
501        }
502        // Worst case, we return the original start character, ampersand
503        return '&';
504    }
505
506    /**
507     * Given a string of HTML text and a query containing any number of search terms, returns
508     * an HTML string in which those search terms are highlighted (intended for use in a WebView)
509     *
510     * @param text the HTML text to process
511     * @param query the search terms
512     * @return HTML text with the search terms highlighted
513     */
514    @VisibleForTesting
515    public static String highlightTermsInHtml(String text, String query) {
516        try {
517            return highlightTerms(text, query, true).toString();
518        } catch (IOException e) {
519            // Can't happen, but we must catch this
520            return text;
521        }
522    }
523
524    /**
525     * Given a string of plain text and a query containing any number of search terms, returns
526     * a CharSequence in which those search terms are highlighted (intended for use in a TextView)
527     *
528     * @param text the text to process
529     * @param query the search terms
530     * @return a CharSequence with the search terms highlighted
531     */
532    public static CharSequence highlightTermsInText(String text, String query) {
533        try {
534            return highlightTerms(text, query, false);
535        } catch (IOException e) {
536            // Can't happen, but we must catch this
537            return text;
538        }
539    }
540
541    static class SearchTerm {
542        final String mTerm;
543        final String mTermLowerCase;
544        final int mLength;
545        int mMatchLength = 0;
546        int mMatchStart = -1;
547
548        SearchTerm(String term) {
549            mTerm = term;
550            mTermLowerCase = term.toLowerCase();
551            mLength = term.length();
552        }
553    }
554
555    /**
556     * Generate a version of the incoming text in which all search terms in a query are highlighted.
557     * If the input is HTML, we return a StringBuilder with additional markup as required
558     * If the input is text, we return a SpannableStringBuilder with additional spans as required
559     *
560     * @param text the text to be processed
561     * @param query the query, which can contain multiple terms separated by whitespace
562     * @param html whether or not the text to be processed is HTML
563     * @return highlighted text
564     *
565     * @throws IOException as Appendable requires this
566     */
567    public static CharSequence highlightTerms(String text, String query, boolean html)
568            throws IOException {
569        // Handle null and empty string
570        if (TextUtils.isEmpty(text)) return "";
571        final int length = text.length();
572
573        // Break up the query into search terms
574        ArrayList<SearchTerm> terms = new ArrayList<SearchTerm>();
575        if (query != null) {
576            StringTokenizer st = new StringTokenizer(query);
577            while (st.hasMoreTokens()) {
578                terms.add(new SearchTerm(st.nextToken()));
579            }
580        }
581
582        // Our appendable depends on whether we're building HTML text (for webview) or spannable
583        // text (for UI)
584        final Appendable sb = html ? new StringBuilder() : new SpannableStringBuilder();
585        // Indicates whether we're in the middle of an HTML tag
586        boolean inTag = false;
587        // The position of the last input character copied to output
588        int lastOut = -1;
589
590        // Walk through the text until we're done with the input
591        // Just copy any HTML tags directly into the output; search for terms in the remaining text
592        for (int i = 0; i < length; i++) {
593            char chr = text.charAt(i);
594            if (html) {
595                if (!inTag && (chr == '<')) {
596                    // Find tags; they will begin with <! or !- or </ or <letter
597                    if (i < (length - 1)) {
598                        char peek = text.charAt(i + 1);
599                        if (peek == '!' || peek == '-' || peek == '/' || Character.isLetter(peek)) {
600                            inTag = true;
601                            // Skip content of title, script, style and applet tags
602                            if (i < (length - (MAX_STRIP_TAG_LENGTH + 2))) {
603                                String tag = text.substring(i + 1, i + MAX_STRIP_TAG_LENGTH + 1);
604                                String tagLowerCase = tag.toLowerCase();
605                                boolean stripContent = false;
606                                for (String stripTag: STRIP_TAGS) {
607                                    if (tagLowerCase.startsWith(stripTag)) {
608                                        stripContent = true;
609                                        tag = tag.substring(0, stripTag.length());
610                                        break;
611                                    }
612                                }
613                                if (stripContent) {
614                                    // Look for the end of this tag
615                                    int endTagPosition = findTagEnd(text, tag, i);
616                                    if (endTagPosition < 0) {
617                                        sb.append(text.substring(i));
618                                        break;
619                                    } else {
620                                        sb.append(text.substring(i, endTagPosition - 1));
621                                        i = endTagPosition - 1;
622                                        chr = text.charAt(i);
623                                    }
624                                }
625                            }
626                        }
627                    }
628                } else if (inTag && (chr == '>')) {
629                    inTag = false;
630                }
631
632                if (inTag) {
633                    sb.append(chr);
634                    continue;
635                }
636            }
637
638            // After all that, we've got some "body" text
639            char chrLowerCase = Character.toLowerCase(chr);
640            // Whether or not the current character should be appended to the output; we inhibit
641            // this while any search terms match
642            boolean appendNow = true;
643            // Look through search terms for matches
644            for (SearchTerm t: terms) {
645                if (chrLowerCase == t.mTermLowerCase.charAt(t.mMatchLength)) {
646                    if (t.mMatchLength++ == 0) {
647                        // New match start
648                        t.mMatchStart = i;
649                    }
650                    if (t.mMatchLength == t.mLength) {
651                        String matchText = text.substring(t.mMatchStart, t.mMatchStart + t.mLength);
652                        // Completed match; add highlight and reset term
653                        if (t.mMatchStart <= lastOut) {
654                            matchText = text.substring(lastOut + 1, i + 1);
655                        }
656                        /*else*/
657                        if (matchText.length() == 0) {} else
658                        if (html) {
659                            sb.append("<span style=\"background-color: " + HIGHLIGHT_COLOR_STRING +
660                                    "\">");
661                            sb.append(matchText);
662                            sb.append("</span>");
663                        } else {
664                            SpannableString highlightSpan = new SpannableString(matchText);
665                            highlightSpan.setSpan(new BackgroundColorSpan(HIGHLIGHT_COLOR_INT), 0,
666                                    highlightSpan.length(), Spannable.SPAN_EXCLUSIVE_EXCLUSIVE);
667                            sb.append(highlightSpan);
668                        }
669                        lastOut = t.mMatchStart + t.mLength - 1;
670                        t.mMatchLength = 0;
671                        t.mMatchStart = -1;
672                    }
673                    appendNow = false;
674                } else {
675                    if (t.mMatchStart >= 0) {
676                        // We're no longer matching; check for other matches in progress
677                        int leastOtherStart = -1;
678                        for (SearchTerm ot: terms) {
679                            // Save away the lowest match start for other search terms
680                            if ((ot != t) && (ot.mMatchStart >= 0) && ((leastOtherStart < 0) ||
681                                    (ot.mMatchStart <= leastOtherStart))) {
682                                leastOtherStart = ot.mMatchStart;
683                            }
684                        }
685                        int matchEnd = t.mMatchStart + t.mMatchLength;
686                        if (leastOtherStart < 0 || leastOtherStart > matchEnd) {
687                            // Append the whole thing
688                            if (t.mMatchStart > lastOut) {
689                                sb.append(text.substring(t.mMatchStart, matchEnd));
690                                lastOut = matchEnd;
691                            }
692                        } else if (leastOtherStart == t.mMatchStart) {
693                            // Ok to append the current char
694                        } else if (leastOtherStart < t.mMatchStart) {
695                            // We're already covered by another search term, so don't append
696                            appendNow = false;
697                        } else if (t.mMatchStart > lastOut) {
698                            // Append the piece of our term that's not already covered
699                            sb.append(text.substring(t.mMatchStart, leastOtherStart));
700                            lastOut = leastOtherStart;
701                        }
702                    }
703                    // Reset this term
704                    t.mMatchLength = 0;
705                    t.mMatchStart = -1;
706                }
707            }
708
709            if (appendNow) {
710                sb.append(chr);
711                lastOut = i;
712            }
713        }
714
715        return (CharSequence)sb;
716   }
717
718    /**
719     * Determine whether two Strings (either of which might be null) are the same; this is true
720     * when both are null or both are Strings that are equal.
721     */
722    public static boolean stringOrNullEquals(String a, String b) {
723        if (a == null && b == null) return true;
724        if (a != null && b != null && a.equals(b)) return true;
725        return false;
726    }
727
728}
729