1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.android.util;
18
19import java.util.ArrayList;
20import java.util.HashMap;
21import java.util.LinkedHashMap;
22import java.util.regex.Matcher;
23import java.util.regex.Pattern;
24import java.util.Set;
25import java.util.List;
26
27/**
28 *
29 * Logic for parsing a text message typed by the user looking for smileys,
30 * urls, acronyms,formatting (e.g., '*'s for bold), me commands
31 * (e.g., "/me is asleep"), and punctuation.
32 *
33 * It constructs an array, which breaks the text up into its
34 * constituent pieces, which we return to the client.
35 *
36 */
37public abstract class AbstractMessageParser {
38/**
39 * Interface representing the set of resources needed by a message parser
40 *
41 * @author jessan (Jessan Hutchison-Quillian)
42 */
43  public static interface Resources {
44
45    /** Get the known set of URL schemes. */
46    public Set<String> getSchemes();
47
48    /** Get the possible values for the last part of a domain name.
49     *  Values are expected to be reversed in the Trie.
50     */
51    public TrieNode getDomainSuffixes();
52
53    /** Get the smileys accepted by the parser. */
54    public TrieNode getSmileys();
55
56    /** Get the acronyms accepted by the parser. */
57    public TrieNode getAcronyms();
58  }
59
60  /**
61   * Subclasses must define the schemes, domains, smileys and acronyms
62   * that are necessary for parsing
63   */
64  protected abstract Resources getResources();
65
66  /** Music note that indicates user is listening to a music track. */
67  public static final String musicNote = "\u266B ";
68
69  private String text;
70  private int nextChar;
71  private int nextClass;
72  private ArrayList<Part> parts;
73  private ArrayList<Token> tokens;
74  private HashMap<Character,Format> formatStart;
75  private boolean parseSmilies;
76  private boolean parseAcronyms;
77  private boolean parseFormatting;
78  private boolean parseUrls;
79  private boolean parseMeText;
80  private boolean parseMusic;
81
82  /**
83   * Create a message parser to parse urls, formatting, acronyms, smileys,
84   * /me text and  music
85   *
86   * @param text the text to parse
87   */
88  public AbstractMessageParser(String text) {
89    this(text, true, true, true, true, true, true);
90  }
91
92  /**
93   * Create a message parser, specifying the kinds of text to parse
94   *
95   * @param text the text to parse
96   *
97   */
98  public AbstractMessageParser(String text, boolean parseSmilies,
99      boolean parseAcronyms, boolean parseFormatting, boolean parseUrls,
100      boolean parseMusic, boolean parseMeText) {
101    this.text = text;
102    this.nextChar = 0;
103    this.nextClass = 10;
104    this.parts = new ArrayList<Part>();
105    this.tokens = new ArrayList<Token>();
106    this.formatStart = new HashMap<Character,Format>();
107    this.parseSmilies = parseSmilies;
108    this.parseAcronyms = parseAcronyms;
109    this.parseFormatting = parseFormatting;
110    this.parseUrls = parseUrls;
111    this.parseMusic = parseMusic;
112    this.parseMeText = parseMeText;
113  }
114
115  /** Returns the raw text being parsed. */
116  public final String getRawText() { return text; }
117
118  /** Return the number of parts. */
119  public final int getPartCount() { return parts.size(); }
120
121  /** Return the part at the given index. */
122  public final Part getPart(int index) { return parts.get(index); }
123
124  /** Return the list of parts from the parsed text */
125  public final List<Part> getParts() { return parts; }
126
127  /** Parses the text string into an internal representation. */
128  public void parse() {
129    // Look for music track (of which there would be only one and it'll be the
130    // first token)
131    if (parseMusicTrack()) {
132      buildParts(null);
133      return;
134    }
135
136    // Look for me commands.
137    String meText = null;
138    if (parseMeText && text.startsWith("/me") && (text.length() > 3) &&
139        Character.isWhitespace(text.charAt(3))) {
140      meText = text.substring(0, 4);
141      text = text.substring(4);
142    }
143
144    // Break the text into tokens.
145    boolean wasSmiley = false;
146    while (nextChar < text.length()) {
147      if (!isWordBreak(nextChar)) {
148        if (!wasSmiley || !isSmileyBreak(nextChar)) {
149          throw new AssertionError("last chunk did not end at word break");
150        }
151      }
152
153      if (parseSmiley()) {
154        wasSmiley = true;
155      } else {
156        wasSmiley = false;
157
158        if (!parseAcronym() && !parseURL() && !parseFormatting()) {
159          parseText();
160        }
161      }
162    }
163
164    // Trim the whitespace before and after media components.
165    for (int i = 0; i < tokens.size(); ++i) {
166      if (tokens.get(i).isMedia()) {
167        if ((i > 0) && (tokens.get(i - 1) instanceof Html)) {
168          ((Html)tokens.get(i - 1)).trimLeadingWhitespace();
169        }
170        if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) {
171          ((Html)tokens.get(i + 1)).trimTrailingWhitespace();
172        }
173      }
174    }
175
176    // Remove any empty html tokens.
177    for (int i = 0; i < tokens.size(); ++i) {
178      if (tokens.get(i).isHtml() &&
179          (tokens.get(i).toHtml(true).length() == 0)) {
180        tokens.remove(i);
181        --i;  // visit this index again
182      }
183    }
184
185    buildParts(meText);
186  }
187
188  /**
189   * Get a the appropriate Token for a given URL
190   *
191   * @param text the anchor text
192   * @param url the url
193   *
194   */
195  public static Token tokenForUrl(String url, String text) {
196    if(url == null) {
197      return null;
198    }
199
200    //Look for video links
201    Video video = Video.matchURL(url, text);
202    if (video != null) {
203      return video;
204    }
205
206    // Look for video links.
207    YouTubeVideo ytVideo = YouTubeVideo.matchURL(url, text);
208    if (ytVideo != null) {
209      return ytVideo;
210    }
211
212    // Look for photo links.
213    Photo photo = Photo.matchURL(url, text);
214    if (photo != null) {
215      return photo;
216    }
217
218    // Look for photo links.
219    FlickrPhoto flickrPhoto = FlickrPhoto.matchURL(url, text);
220    if (flickrPhoto != null) {
221      return flickrPhoto;
222    }
223
224    //Not media, so must be a regular URL
225    return new Link(url, text);
226  }
227
228  /**
229   * Builds the parts list.
230   *
231   * @param meText any meText parsed from the message
232   */
233  private void buildParts(String meText) {
234    for (int i = 0; i < tokens.size(); ++i) {
235      Token token = tokens.get(i);
236      if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) {
237        parts.add(new Part());
238      }
239      lastPart().add(token);
240    }
241
242    // The first part inherits the meText of the line.
243    if (parts.size() > 0) {
244      parts.get(0).setMeText(meText);
245    }
246  }
247
248  /** Returns the last part in the list. */
249  private Part lastPart() { return parts.get(parts.size() - 1); }
250
251  /**
252   * Looks for a music track (\u266B is first character, everything else is
253   * track info).
254   */
255  private boolean parseMusicTrack() {
256
257    if (parseMusic && text.startsWith(musicNote)) {
258      addToken(new MusicTrack(text.substring(musicNote.length())));
259      nextChar = text.length();
260      return true;
261    }
262    return false;
263  }
264
265  /** Consumes all of the text in the next word . */
266  private void parseText() {
267    StringBuilder buf = new StringBuilder();
268    int start = nextChar;
269    do {
270      char ch = text.charAt(nextChar++);
271      switch (ch) {
272        case '<':  buf.append("&lt;"); break;
273        case '>':  buf.append("&gt;"); break;
274        case '&':  buf.append("&amp;"); break;
275        case '"':  buf.append("&quot;"); break;
276        case '\'':  buf.append("&apos;"); break;
277        case '\n':  buf.append("<br>"); break;
278        default:  buf.append(ch); break;
279      }
280    } while (!isWordBreak(nextChar));
281
282    addToken(new Html(text.substring(start, nextChar), buf.toString()));
283  }
284
285  /**
286   * Looks for smileys (e.g., ":)") in the text.  The set of known smileys is
287   * loaded from a file into a trie at server start.
288   */
289  private boolean parseSmiley() {
290    if(!parseSmilies) {
291      return false;
292    }
293    TrieNode match = longestMatch(getResources().getSmileys(), this, nextChar,
294                                  true);
295    if (match == null) {
296      return false;
297    } else {
298      int previousCharClass = getCharClass(nextChar - 1);
299      int nextCharClass = getCharClass(nextChar + match.getText().length());
300      if ((previousCharClass == 2 || previousCharClass == 3)
301          && (nextCharClass == 2 || nextCharClass == 3)) {
302        return false;
303      }
304      addToken(new Smiley(match.getText()));
305      nextChar += match.getText().length();
306      return true;
307    }
308  }
309
310  /** Looks for acronyms (e.g., "lol") in the text.
311   */
312  private boolean parseAcronym() {
313    if(!parseAcronyms) {
314      return false;
315    }
316    TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar);
317    if (match == null) {
318      return false;
319    } else {
320      addToken(new Acronym(match.getText(), match.getValue()));
321      nextChar += match.getText().length();
322      return true;
323    }
324  }
325
326  /** Determines if this is an allowable domain character. */
327  private boolean isDomainChar(char c) {
328    return c == '-' || Character.isLetter(c) || Character.isDigit(c);
329  }
330
331  /** Determines if the given string is a valid domain. */
332  private boolean isValidDomain(String domain) {
333    // For hostnames, check that it ends with a known domain suffix
334    if (matches(getResources().getDomainSuffixes(), reverse(domain))) {
335      return true;
336    }
337    return false;
338  }
339
340  /**
341   * Looks for a URL in two possible forms:  either a proper URL with a known
342   * scheme or a domain name optionally followed by a path, query, or query.
343   */
344  private boolean parseURL() {
345    // Make sure this is a valid place to start a URL.
346    if (!parseUrls || !isURLBreak(nextChar)) {
347      return false;
348    }
349
350    int start = nextChar;
351
352    // Search for the first block of letters.
353    int index = start;
354    while ((index < text.length()) && isDomainChar(text.charAt(index))) {
355      index += 1;
356    }
357
358    String url = "";
359    boolean done = false;
360
361    if (index == text.length()) {
362      return false;
363    } else if (text.charAt(index) == ':') {
364      // Make sure this is a known scheme.
365      String scheme = text.substring(nextChar, index);
366      if (!getResources().getSchemes().contains(scheme)) {
367        return false;
368      }
369    } else if (text.charAt(index) == '.') {
370      // Search for the end of the domain name.
371      while (index < text.length()) {
372        char ch = text.charAt(index);
373        if ((ch != '.') && !isDomainChar(ch)) {
374          break;
375        } else {
376          index += 1;
377        }
378      }
379
380      // Make sure the domain name has a valid suffix.  Since tries look for
381      // prefix matches, we reverse all the strings to get suffix comparisons.
382      String domain = text.substring(nextChar, index);
383      if (!isValidDomain(domain)) {
384        return false;
385      }
386
387      // Search for a port.  We deal with this specially because a colon can
388      // also be a punctuation character.
389      if ((index + 1 < text.length()) && (text.charAt(index) == ':')) {
390        char ch = text.charAt(index + 1);
391        if (Character.isDigit(ch)) {
392          index += 1;
393          while ((index < text.length()) &&
394                 Character.isDigit(text.charAt(index))) {
395            index += 1;
396          }
397        }
398      }
399
400      // The domain name should be followed by end of line, whitespace,
401      // punctuation, or a colon, slash, question, or hash character.  The
402      // tricky part here is that some URL characters are also punctuation, so
403      // we need to distinguish them.  Since we looked for ports above, a colon
404      // is always punctuation here.  To distinguish '?' cases, we look at the
405      // character that follows it.
406      if (index == text.length()) {
407        done = true;
408      } else {
409        char ch = text.charAt(index);
410        if (ch == '?') {
411          // If the next character is whitespace or punctuation (or missing),
412          // then this question mark looks like punctuation.
413          if (index + 1 == text.length()) {
414            done = true;
415          } else {
416            char ch2 = text.charAt(index + 1);
417            if (Character.isWhitespace(ch2) || isPunctuation(ch2)) {
418              done = true;
419            }
420          }
421        } else if (isPunctuation(ch)) {
422          done = true;
423        } else if (Character.isWhitespace(ch)) {
424          done = true;
425        } else if ((ch == '/') || (ch == '#')) {
426          // In this case, the URL is not done.  We will search for the end of
427          // it below.
428        } else {
429          return false;
430        }
431      }
432
433      // We will assume the user meant HTTP.  (One weird case is where they
434      // type a port of 443.  That could mean HTTPS, but they might also want
435      // HTTP.  We'll let them specify if they don't want HTTP.)
436      url = "http://";
437    } else {
438      return false;
439    }
440
441    // If the URL is not done, search for the end, which is just before the
442    // next whitespace character.
443    if (!done) {
444      while ((index < text.length()) &&
445             !Character.isWhitespace(text.charAt(index))) {
446        index += 1;
447      }
448    }
449
450    String urlText = text.substring(start, index);
451    url += urlText;
452
453    // Figure out the appropriate token type.
454    addURLToken(url, urlText);
455
456    nextChar = index;
457    return true;
458  }
459
460  /**
461   * Adds the appropriate token for the given URL.  This might be a simple
462   * link or it might be a recognized media type.
463   */
464  private void addURLToken(String url, String text) {
465     addToken(tokenForUrl(url, text));
466  }
467
468  /**
469   * Deal with formatting characters.
470   *
471   * Parsing is as follows:
472   *  - Treat all contiguous strings of formatting characters as one block.
473   *    (This method processes one block.)
474   *  - Only a single instance of a particular format character within a block
475   *    is used to determine whether to turn on/off that type of formatting;
476   *    other instances simply print the character itself.
477   *  - If the format is to be turned on, we use the _first_ instance; if it
478   *    is to be turned off, we use the _last_ instance (by appending the
479   *    format.)
480   *
481   * Example:
482   *   **string** turns into <b>*string*</b>
483   */
484  private boolean parseFormatting() {
485    if(!parseFormatting) {
486      return false;
487    }
488    int endChar = nextChar;
489    while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) {
490      endChar += 1;
491    }
492
493    if ((endChar == nextChar) || !isWordBreak(endChar)) {
494      return false;
495    }
496
497    // Keeps track of whether we've seen a character (in map if we've seen it)
498    // and whether we should append a closing format token (if value in
499    // map is TRUE).  Linked hashmap for consistent ordering.
500    LinkedHashMap<Character, Boolean> seenCharacters =
501        new LinkedHashMap<Character, Boolean>();
502
503    for (int index = nextChar; index < endChar; ++index) {
504      char ch = text.charAt(index);
505      Character key = Character.valueOf(ch);
506      if (seenCharacters.containsKey(key)) {
507        // Already seen this character, just append an unmatched token, which
508        // will print plaintext character
509        addToken(new Format(ch, false));
510      } else {
511        Format start = formatStart.get(key);
512        if (start != null) {
513          // Match the start token, and ask an end token to be appended
514          start.setMatched(true);
515          formatStart.remove(key);
516          seenCharacters.put(key, Boolean.TRUE);
517        } else {
518          // Append start token
519          start = new Format(ch, true);
520          formatStart.put(key, start);
521          addToken(start);
522          seenCharacters.put(key, Boolean.FALSE);
523        }
524      }
525    }
526
527    // Append any necessary end tokens
528    for (Character key : seenCharacters.keySet()) {
529      if (seenCharacters.get(key) == Boolean.TRUE) {
530        Format end = new Format(key.charValue(), false);
531        end.setMatched(true);
532        addToken(end);
533      }
534    }
535
536    nextChar = endChar;
537    return true;
538  }
539
540  /** Determines whether the given index could be a possible word break. */
541  private boolean isWordBreak(int index) {
542    return getCharClass(index - 1) != getCharClass(index);
543  }
544
545  /** Determines whether the given index could be a possible smiley break. */
546  private boolean isSmileyBreak(int index) {
547    if (index > 0 && index < text.length()) {
548      if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) {
549        return true;
550      }
551    }
552
553    return false;
554  }
555
556  /**
557   * Verifies that the character before the given index is end of line,
558   * whitespace, or punctuation.
559   */
560  private boolean isURLBreak(int index) {
561    switch (getCharClass(index - 1)) {
562      case 2:
563      case 3:
564      case 4:
565        return false;
566
567      case 0:
568      case 1:
569      default:
570        return true;
571    }
572  }
573
574  /** Returns the class for the character at the given index. */
575  private int getCharClass(int index) {
576    if ((index < 0) || (text.length() <= index)) {
577      return 0;
578    }
579
580    char ch = text.charAt(index);
581    if (Character.isWhitespace(ch)) {
582      return 1;
583    } else if (Character.isLetter(ch)) {
584      return 2;
585    } else if (Character.isDigit(ch)) {
586      return 3;
587    } else if (isPunctuation(ch)) {
588      // For punctuation, we return a unique value every time so that they are
589      // always different from any other character.  Punctuation should always
590      // be considered a possible word break.
591      return ++nextClass;
592    } else {
593      return 4;
594    }
595  }
596
597  /**
598   * Returns true if <code>c1</code> could be the last character of
599   * a smiley and <code>c2</code> could be the first character of
600   * a different smiley, if {@link #isWordBreak} would not already
601   * recognize that this is possible.
602   */
603  private static boolean isSmileyBreak(char c1, char c2) {
604    switch (c1) {
605      /*
606       * These characters can end smileys, but don't normally end words.
607       */
608      case '$': case '&': case '*': case '+': case '-':
609      case '/': case '<': case '=': case '>': case '@':
610      case '[': case '\\': case ']': case '^': case '|':
611      case '}': case '~':
612        switch (c2) {
613          /*
614           * These characters can begin smileys, but don't normally
615           * begin words.
616           */
617          case '#': case '$': case '%': case '*': case '/':
618          case '<': case '=': case '>': case '@': case '[':
619          case '\\': case '^': case '~':
620            return true;
621        }
622    }
623
624    return false;
625  }
626
627  /** Determines whether the given character is punctuation. */
628  private static boolean isPunctuation(char ch) {
629    switch (ch) {
630      case '.': case ',': case '"': case ':': case ';':
631      case '?': case '!': case '(': case ')':
632        return true;
633
634      default:
635        return false;
636    }
637  }
638
639  /**
640   * Determines whether the given character is the beginning or end of a
641   * section with special formatting.
642   */
643  private static boolean isFormatChar(char ch) {
644    switch (ch) {
645      case '*': case '_': case '^':
646        return true;
647
648      default:
649        return false;
650    }
651  }
652
653  /** Represents a unit of parsed output. */
654  public static abstract class Token {
655    public enum Type {
656
657      HTML ("html"),
658      FORMAT ("format"),  // subtype of HTML
659      LINK ("l"),
660      SMILEY ("e"),
661      ACRONYM ("a"),
662      MUSIC ("m"),
663      GOOGLE_VIDEO ("v"),
664      YOUTUBE_VIDEO ("yt"),
665      PHOTO ("p"),
666      FLICKR ("f");
667
668      //stringreps for HTML and FORMAT don't really matter
669      //because they don't define getInfo(), which is where it is used
670      //For the other types, code depends on their stringreps
671      private String stringRep;
672
673      Type(String stringRep) {
674        this.stringRep = stringRep;
675      }
676
677      /** {@inheritDoc} */
678      public String toString() {
679        return this.stringRep;
680      }
681    }
682
683    protected Type type;
684    protected String text;
685
686    protected Token(Type type, String text) {
687      this.type = type;
688      this.text = text;
689    }
690
691    /** Returns the type of the token. */
692    public Type getType() { return type; }
693
694    /**
695     * Get the relevant information about a token
696     *
697     * @return a list of strings representing the token, not null
698     *         The first item is always a string representation of the type
699     */
700    public List<String> getInfo() {
701      List<String> info = new ArrayList<String>();
702      info.add(getType().toString());
703      return info;
704    }
705
706    /** Returns the raw text of the token. */
707    public String getRawText() { return text; }
708
709    public boolean isMedia() { return false; }
710    public abstract boolean isHtml();
711    public boolean isArray() { return !isHtml(); }
712
713    public String toHtml(boolean caps) { throw new AssertionError("not html"); }
714
715    // The token can change the caps of the text after that point.
716    public boolean controlCaps() { return false; }
717    public boolean setCaps() { return false; }
718  }
719
720  /** Represents a simple string of html text. */
721  public static class Html extends Token {
722    private String html;
723
724    public Html(String text, String html) {
725      super(Type.HTML, text);
726      this.html = html;
727    }
728
729    public boolean isHtml() { return true; }
730    public String toHtml(boolean caps) {
731      return caps ? html.toUpperCase() : html;
732    }
733    /**
734     * Not supported. Info should not be needed for this type
735     */
736    public List<String> getInfo() {
737      throw new UnsupportedOperationException();
738    }
739
740    public void trimLeadingWhitespace() {
741      text = trimLeadingWhitespace(text);
742      html = trimLeadingWhitespace(html);
743    }
744
745    public void trimTrailingWhitespace() {
746      text = trimTrailingWhitespace(text);
747      html = trimTrailingWhitespace(html);
748    }
749
750    private static String trimLeadingWhitespace(String text) {
751      int index = 0;
752      while ((index < text.length()) &&
753             Character.isWhitespace(text.charAt(index))) {
754        ++index;
755      }
756      return text.substring(index);
757    }
758
759    public static String trimTrailingWhitespace(String text) {
760      int index = text.length();
761      while ((index > 0) && Character.isWhitespace(text.charAt(index - 1))) {
762        --index;
763      }
764      return text.substring(0, index);
765    }
766  }
767
768  /** Represents a music track token at the beginning. */
769  public static class MusicTrack extends Token {
770    private String track;
771
772    public MusicTrack(String track) {
773      super(Type.MUSIC, track);
774      this.track = track;
775    }
776
777    public String getTrack() { return track; }
778
779    public boolean isHtml() { return false; }
780
781    public List<String> getInfo() {
782      List<String> info = super.getInfo();
783      info.add(getTrack());
784      return info;
785    }
786  }
787
788  /** Represents a link that was found in the input. */
789  public static class Link extends Token {
790    private String url;
791
792    public Link(String url, String text) {
793      super(Type.LINK, text);
794      this.url = url;
795    }
796
797    public String getURL() { return url; }
798
799    public boolean isHtml() { return false; }
800
801    public List<String> getInfo() {
802      List<String> info = super.getInfo();
803      info.add(getURL());
804      info.add(getRawText());
805      return info;
806    }
807  }
808
809  /** Represents a link to a Google Video. */
810  public static class Video extends Token {
811    /** Pattern for a video URL. */
812    private static final Pattern URL_PATTERN = Pattern.compile(
813        "(?i)http://video\\.google\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/videoplay\\?"
814        + ".*?\\bdocid=(-?\\d+).*");
815
816    private String docid;
817
818    public Video(String docid, String text) {
819      super(Type.GOOGLE_VIDEO, text);
820      this.docid = docid;
821    }
822
823    public String getDocID() { return docid; }
824
825    public boolean isHtml() { return false; }
826    public boolean isMedia() { return true; }
827
828    /** Returns a Video object if the given url is to a video. */
829    public static Video matchURL(String url, String text) {
830      Matcher m = URL_PATTERN.matcher(url);
831      if (m.matches()) {
832        return new Video(m.group(1), text);
833      } else {
834        return null;
835      }
836    }
837
838    public List<String> getInfo() {
839      List<String> info = super.getInfo();
840      info.add(getRssUrl(docid));
841      info.add(getURL(docid));
842      return info;
843    }
844
845    /** Returns the URL for the RSS description of the given video. */
846    public static String getRssUrl(String docid) {
847      return "http://video.google.com/videofeed"
848             + "?type=docid&output=rss&sourceid=gtalk&docid=" + docid;
849    }
850
851    /** (For testing purposes:) Returns a video URL with the given parts.  */
852    public static String getURL(String docid) {
853      return getURL(docid, null);
854    }
855
856    /** (For testing purposes:) Returns a video URL with the given parts.  */
857    public static String getURL(String docid, String extraParams) {
858      if (extraParams == null) {
859        extraParams = "";
860      } else if (extraParams.length() > 0) {
861        extraParams += "&";
862      }
863      return "http://video.google.com/videoplay?" + extraParams
864             + "docid=" + docid;
865    }
866  }
867
868  /** Represents a link to a YouTube video. */
869  public static class YouTubeVideo extends Token {
870    /** Pattern for a video URL. */
871    private static final Pattern URL_PATTERN = Pattern.compile(
872        "(?i)http://(?:[a-z0-9]+\\.)?youtube\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/watch\\?"
873        + ".*\\bv=([-_a-zA-Z0-9=]+).*");
874
875    private String docid;
876
877    public YouTubeVideo(String docid, String text) {
878      super(Type.YOUTUBE_VIDEO, text);
879      this.docid = docid;
880    }
881
882    public String getDocID() { return docid; }
883
884    public boolean isHtml() { return false; }
885    public boolean isMedia() { return true; }
886
887    /** Returns a Video object if the given url is to a video. */
888    public static YouTubeVideo matchURL(String url, String text) {
889      Matcher m = URL_PATTERN.matcher(url);
890      if (m.matches()) {
891        return new YouTubeVideo(m.group(1), text);
892      } else {
893        return null;
894      }
895    }
896
897    public List<String> getInfo() {
898      List<String> info = super.getInfo();
899      info.add(getRssUrl(docid));
900      info.add(getURL(docid));
901      return info;
902    }
903
904    /** Returns the URL for the RSS description of the given video. */
905    public static String getRssUrl(String docid) {
906      return "http://youtube.com/watch?v=" + docid;
907    }
908
909    /** (For testing purposes:) Returns a video URL with the given parts.  */
910    public static String getURL(String docid) {
911      return getURL(docid, null);
912    }
913
914    /** (For testing purposes:) Returns a video URL with the given parts.  */
915    public static String getURL(String docid, String extraParams) {
916      if (extraParams == null) {
917        extraParams = "";
918      } else if (extraParams.length() > 0) {
919        extraParams += "&";
920      }
921      return "http://youtube.com/watch?" + extraParams + "v=" + docid;
922    }
923
924    /** (For testing purposes:) Returns a video URL with the given parts.
925      * @param http If true, includes http://
926      * @param prefix If non-null/non-blank, adds to URL before youtube.com.
927      *   (e.g., prefix="br." --> "br.youtube.com")
928      */
929    public static String getPrefixedURL(boolean http, String prefix,
930                                        String docid, String extraParams) {
931      String protocol = "";
932
933      if (http) {
934        protocol = "http://";
935      }
936
937      if (prefix == null) {
938        prefix = "";
939      }
940
941      if (extraParams == null) {
942        extraParams = "";
943      } else if (extraParams.length() > 0) {
944        extraParams += "&";
945      }
946
947      return protocol + prefix + "youtube.com/watch?" + extraParams + "v=" +
948              docid;
949    }
950  }
951
952  /** Represents a link to a Picasa photo or album. */
953  public static class Photo extends Token {
954    /** Pattern for an album or photo URL. */
955    // TODO (katyarogers) searchbrowse includes search lists and tags,
956    // it follows a different pattern than albums - would be nice to add later
957    private static final Pattern URL_PATTERN = Pattern.compile(
958        "http://picasaweb.google.com/([^/?#&]+)/+((?!searchbrowse)[^/?#&]+)(?:/|/photo)?(?:\\?[^#]*)?(?:#(.*))?");
959
960    private String user;
961    private String album;
962    private String photo;  // null for albums
963
964    public Photo(String user, String album, String photo, String text) {
965      super(Type.PHOTO, text);
966      this.user = user;
967      this.album = album;
968      this.photo = photo;
969    }
970
971    public String getUser() { return user; }
972    public String getAlbum() { return album; }
973    public String getPhoto() { return photo; }
974
975    public boolean isHtml() { return false; }
976    public boolean isMedia() { return true; }
977
978    /** Returns a Photo object if the given url is to a photo or album. */
979    public static Photo matchURL(String url, String text) {
980      Matcher m = URL_PATTERN.matcher(url);
981      if (m.matches()) {
982        return new Photo(m.group(1), m.group(2), m.group(3), text);
983      } else {
984        return null;
985      }
986    }
987
988    public List<String> getInfo() {
989      List<String> info = super.getInfo();
990      info.add(getRssUrl(getUser()));
991      info.add(getAlbumURL(getUser(), getAlbum()));
992      if (getPhoto() != null) {
993        info.add(getPhotoURL(getUser(), getAlbum(), getPhoto()));
994      } else {
995        info.add((String)null);
996      }
997      return info;
998    }
999
1000    /** Returns the URL for the RSS description of the user's albums. */
1001    public static String getRssUrl(String user) {
1002      return "http://picasaweb.google.com/data/feed/api/user/" + user +
1003        "?category=album&alt=rss";
1004    }
1005
1006    /** Returns the URL for an album. */
1007    public static String getAlbumURL(String user, String album) {
1008      return "http://picasaweb.google.com/" + user + "/" + album;
1009    }
1010
1011    /** Returns the URL for a particular photo. */
1012    public static String getPhotoURL(String user, String album, String photo) {
1013      return "http://picasaweb.google.com/" + user + "/" + album + "/photo#"
1014             + photo;
1015    }
1016  }
1017
1018  /** Represents a link to a Flickr photo or album. */
1019  public static class FlickrPhoto extends Token {
1020    /** Pattern for a user album or photo URL. */
1021    private static final Pattern URL_PATTERN = Pattern.compile(
1022        "http://(?:www.)?flickr.com/photos/([^/?#&]+)/?([^/?#&]+)?/?.*");
1023    private static final Pattern GROUPING_PATTERN = Pattern.compile(
1024        "http://(?:www.)?flickr.com/photos/([^/?#&]+)/(tags|sets)/" +
1025        "([^/?#&]+)/?");
1026
1027    private static final String SETS = "sets";
1028    private static final String TAGS = "tags";
1029
1030    private String user;
1031    private String photo;      // null for user album
1032    private String grouping;   // either "tags" or "sets"
1033    private String groupingId; // sets or tags identifier
1034
1035    public FlickrPhoto(String user, String photo, String grouping,
1036                       String groupingId, String text) {
1037      super(Type.FLICKR, text);
1038
1039      /* System wide tags look like the URL to a Flickr user. */
1040      if (!TAGS.equals(user)) {
1041        this.user = user;
1042        // Don't consider slide show URL a photo
1043        this.photo = (!"show".equals(photo) ? photo : null);
1044        this.grouping = grouping;
1045        this.groupingId = groupingId;
1046      } else {
1047        this.user = null;
1048        this.photo = null;
1049        this.grouping = TAGS;
1050        this.groupingId = photo;
1051      }
1052    }
1053
1054    public String getUser() { return user; }
1055    public String getPhoto() { return photo; }
1056    public String getGrouping() { return grouping; }
1057    public String getGroupingId() { return groupingId; }
1058
1059    public boolean isHtml() { return false; }
1060    public boolean isMedia() { return true; }
1061
1062    /**
1063     * Returns a FlickrPhoto object if the given url is to a photo or Flickr
1064     * user.
1065     */
1066    public static FlickrPhoto matchURL(String url, String text) {
1067      Matcher m = GROUPING_PATTERN.matcher(url);
1068      if (m.matches()) {
1069        return new FlickrPhoto(m.group(1), null, m.group(2), m.group(3), text);
1070      }
1071
1072      m = URL_PATTERN.matcher(url);
1073      if (m.matches()) {
1074        return new FlickrPhoto(m.group(1), m.group(2), null, null, text);
1075      } else {
1076        return null;
1077      }
1078    }
1079
1080    public List<String> getInfo() {
1081      List<String> info = super.getInfo();
1082      info.add(getUrl());
1083      info.add(getUser() != null ? getUser() : "");
1084      info.add(getPhoto() != null ? getPhoto() : "");
1085      info.add(getGrouping() != null ? getGrouping() : "");
1086      info.add(getGroupingId() != null ? getGroupingId() : "");
1087      return info;
1088    }
1089
1090    public String getUrl() {
1091      if (SETS.equals(grouping)) {
1092        return getUserSetsURL(user, groupingId);
1093      } else if (TAGS.equals(grouping)) {
1094        if (user != null) {
1095          return getUserTagsURL(user, groupingId);
1096        } else {
1097          return getTagsURL(groupingId);
1098        }
1099      } else if (photo != null) {
1100        return getPhotoURL(user, photo);
1101      } else {
1102        return getUserURL(user);
1103      }
1104    }
1105
1106    /** Returns the URL for the RSS description. */
1107    public static String getRssUrl(String user) {
1108      return null;
1109    }
1110
1111    /** Returns the URL for a particular tag. */
1112    public static String getTagsURL(String tag) {
1113      return "http://flickr.com/photos/tags/" + tag;
1114    }
1115
1116    /** Returns the URL to the user's Flickr homepage. */
1117    public static String getUserURL(String user) {
1118      return "http://flickr.com/photos/" + user;
1119    }
1120
1121    /** Returns the URL for a particular photo. */
1122    public static String getPhotoURL(String user, String photo) {
1123      return "http://flickr.com/photos/" + user + "/" + photo;
1124    }
1125
1126    /** Returns the URL for a user tag photo set. */
1127    public static String getUserTagsURL(String user, String tagId) {
1128      return "http://flickr.com/photos/" + user + "/tags/" + tagId;
1129    }
1130
1131    /** Returns the URL for user set. */
1132    public static String getUserSetsURL(String user, String setId) {
1133      return "http://flickr.com/photos/" + user + "/sets/" + setId;
1134    }
1135  }
1136
1137  /** Represents a smiley that was found in the input. */
1138  public static class Smiley extends Token {
1139    // TODO: Pass the SWF URL down to the client.
1140
1141    public Smiley(String text) {
1142      super(Type.SMILEY, text);
1143    }
1144
1145    public boolean isHtml() { return false; }
1146
1147    public List<String> getInfo() {
1148      List<String> info = super.getInfo();
1149      info.add(getRawText());
1150      return info;
1151    }
1152  }
1153
1154  /** Represents an acronym that was found in the input. */
1155  public static class Acronym extends Token {
1156    private String value;
1157    // TODO: SWF
1158
1159    public Acronym(String text, String value) {
1160      super(Type.ACRONYM, text);
1161      this.value = value;
1162    }
1163
1164    public String getValue() { return value; }
1165
1166    public boolean isHtml() { return false; }
1167
1168    public List<String> getInfo() {
1169      List<String> info = super.getInfo();
1170      info.add(getRawText());
1171      info.add(getValue());
1172      return info;
1173    }
1174  }
1175
1176  /** Represents a character that changes formatting. */
1177  public static class Format extends Token {
1178    private char ch;
1179    private boolean start;
1180    private boolean matched;
1181
1182    public Format(char ch, boolean start) {
1183      super(Type.FORMAT, String.valueOf(ch));
1184      this.ch = ch;
1185      this.start = start;
1186    }
1187
1188    public void setMatched(boolean matched) { this.matched = matched; }
1189
1190    public boolean isHtml() { return true; }
1191
1192    public String toHtml(boolean caps) {
1193      // This character only implies special formatting if it was matched.
1194      // Otherwise, it was just a plain old character.
1195      if (matched) {
1196        return start ? getFormatStart(ch) : getFormatEnd(ch);
1197      } else {
1198        // We have to make sure we escape HTML characters as usual.
1199        return (ch == '"') ? "&quot;" : String.valueOf(ch);
1200      }
1201    }
1202
1203    /**
1204     * Not supported. Info should not be needed for this type
1205     */
1206    public List<String> getInfo() {
1207      throw new UnsupportedOperationException();
1208    }
1209
1210    public boolean controlCaps() { return (ch == '^'); }
1211    public boolean setCaps() { return start; }
1212
1213    private String getFormatStart(char ch) {
1214      switch (ch) {
1215        case '*': return "<b>";
1216        case '_': return "<i>";
1217        case '^': return "<b><font color=\"#005FFF\">"; // TODO: all caps
1218        case '"': return "<font color=\"#999999\">\u201c";
1219        default: throw new AssertionError("unknown format '" + ch + "'");
1220      }
1221    }
1222
1223    private String getFormatEnd(char ch) {
1224      switch (ch) {
1225        case '*': return "</b>";
1226        case '_': return "</i>";
1227        case '^': return "</font></b>"; // TODO: all caps
1228        case '"': return "\u201d</font>";
1229        default: throw new AssertionError("unknown format '" + ch + "'");
1230      }
1231    }
1232  }
1233
1234  /** Adds the given token to the parsed output. */
1235  private void addToken(Token token) {
1236    tokens.add(token);
1237  }
1238
1239  /** Converts the entire message into a single HTML display string. */
1240  public String toHtml() {
1241    StringBuilder html = new StringBuilder();
1242
1243    for (Part part : parts) {
1244      boolean caps = false;
1245
1246      html.append("<p>");
1247      for (Token token : part.getTokens()) {
1248        if (token.isHtml()) {
1249          html.append(token.toHtml(caps));
1250        } else {
1251          switch (token.getType()) {
1252          case LINK:
1253            html.append("<a href=\"");
1254            html.append(((Link)token).getURL());
1255            html.append("\">");
1256            html.append(token.getRawText());
1257            html.append("</a>");
1258            break;
1259
1260          case SMILEY:
1261            // TODO: link to an appropriate image
1262            html.append(token.getRawText());
1263            break;
1264
1265          case ACRONYM:
1266            html.append(token.getRawText());
1267            break;
1268
1269          case MUSIC:
1270            // TODO: include a music glyph
1271            html.append(((MusicTrack)token).getTrack());
1272            break;
1273
1274          case GOOGLE_VIDEO:
1275            // TODO: include a Google Video icon
1276            html.append("<a href=\"");
1277            html.append(((Video)token).getURL(((Video)token).getDocID()));
1278            html.append("\">");
1279            html.append(token.getRawText());
1280            html.append("</a>");
1281            break;
1282
1283          case YOUTUBE_VIDEO:
1284            // TODO: include a YouTube icon
1285            html.append("<a href=\"");
1286            html.append(((YouTubeVideo)token).getURL(
1287                ((YouTubeVideo)token).getDocID()));
1288            html.append("\">");
1289            html.append(token.getRawText());
1290            html.append("</a>");
1291            break;
1292
1293          case PHOTO: {
1294            // TODO: include a Picasa Web icon
1295            html.append("<a href=\"");
1296            html.append(Photo.getAlbumURL(
1297                ((Photo)token).getUser(), ((Photo)token).getAlbum()));
1298            html.append("\">");
1299            html.append(token.getRawText());
1300            html.append("</a>");
1301            break;
1302          }
1303
1304          case FLICKR:
1305            // TODO: include a Flickr icon
1306            Photo p = (Photo) token;
1307            html.append("<a href=\"");
1308            html.append(((FlickrPhoto)token).getUrl());
1309            html.append("\">");
1310            html.append(token.getRawText());
1311            html.append("</a>");
1312            break;
1313
1314          default:
1315            throw new AssertionError("unknown token type: " + token.getType());
1316          }
1317        }
1318
1319        if (token.controlCaps()) {
1320          caps = token.setCaps();
1321        }
1322      }
1323      html.append("</p>\n");
1324    }
1325
1326    return html.toString();
1327  }
1328
1329  /** Returns the reverse of the given string. */
1330  protected static String reverse(String str) {
1331    StringBuilder buf = new StringBuilder();
1332    for (int i = str.length() - 1; i >= 0; --i) {
1333      buf.append(str.charAt(i));
1334    }
1335    return buf.toString();
1336  }
1337
1338  public static class TrieNode {
1339    private final HashMap<Character,TrieNode> children =
1340        new HashMap<Character,TrieNode>();
1341    private String text;
1342    private String value;
1343
1344    public TrieNode() { this(""); }
1345    public TrieNode(String text) {
1346      this.text = text;
1347    }
1348
1349    public final boolean exists() { return value != null; }
1350    public final String getText() { return text; }
1351    public final String getValue() { return value; }
1352    public void setValue(String value) { this.value = value; }
1353
1354    public TrieNode getChild(char ch) {
1355      return children.get(Character.valueOf(ch));
1356    }
1357
1358    public TrieNode getOrCreateChild(char ch) {
1359      Character key = Character.valueOf(ch);
1360      TrieNode node = children.get(key);
1361      if (node == null) {
1362        node = new TrieNode(text + String.valueOf(ch));
1363        children.put(key, node);
1364      }
1365      return node;
1366    }
1367
1368    /** Adds the given string into the trie. */
1369    public static  void addToTrie(TrieNode root, String str, String value) {
1370      int index = 0;
1371      while (index < str.length()) {
1372        root = root.getOrCreateChild(str.charAt(index++));
1373      }
1374      root.setValue(value);
1375    }
1376  }
1377
1378
1379
1380  /** Determines whether the given string is in the given trie. */
1381  private static boolean matches(TrieNode root, String str) {
1382    int index = 0;
1383    while (index < str.length()) {
1384      root = root.getChild(str.charAt(index++));
1385      if (root == null) {
1386        break;
1387      } else if (root.exists()) {
1388        return true;
1389      }
1390    }
1391    return false;
1392  }
1393
1394  /**
1395   * Returns the longest substring of the given string, starting at the given
1396   * index, that exists in the trie.
1397   */
1398  private static TrieNode longestMatch(
1399      TrieNode root, AbstractMessageParser p, int start) {
1400    return longestMatch(root, p, start, false);
1401  }
1402
1403  /**
1404   * Returns the longest substring of the given string, starting at the given
1405   * index, that exists in the trie, with a special tokenizing case for
1406   * smileys if specified.
1407   */
1408  private static TrieNode longestMatch(
1409      TrieNode root, AbstractMessageParser p, int start, boolean smiley) {
1410    int index = start;
1411    TrieNode bestMatch = null;
1412    while (index < p.getRawText().length()) {
1413      root = root.getChild(p.getRawText().charAt(index++));
1414      if (root == null) {
1415        break;
1416      } else if (root.exists()) {
1417        if (p.isWordBreak(index)) {
1418          bestMatch = root;
1419        } else if (smiley && p.isSmileyBreak(index)) {
1420          bestMatch = root;
1421        }
1422      }
1423    }
1424    return bestMatch;
1425  }
1426
1427
1428  /** Represents set of tokens that are delivered as a single message. */
1429  public static class Part {
1430    private String meText;
1431    private ArrayList<Token> tokens;
1432
1433    public Part() {
1434      this.tokens = new ArrayList<Token>();
1435    }
1436
1437    public String getType(boolean isSend) {
1438      return (isSend ? "s" : "r") + getPartType();
1439    }
1440
1441    private String getPartType() {
1442      if (isMedia()) {
1443        return "d";
1444      } else if (meText != null) {
1445        return "m";
1446      } else {
1447        return "";
1448      }
1449    }
1450
1451    public boolean isMedia() {
1452      return (tokens.size() == 1) && tokens.get(0).isMedia();
1453    }
1454    /**
1455     * Convenience method for getting the Token of a Part that represents
1456     * a media Token. Parts of this kind will always only have a single Token
1457     *
1458     * @return if this.isMedia(),
1459     *         returns the Token representing the media contained in this Part,
1460     *         otherwise returns null;
1461     */
1462    public Token getMediaToken() {
1463      if(isMedia()) {
1464        return tokens.get(0);
1465      }
1466      return null;
1467    }
1468
1469    /** Adds the given token to this part. */
1470    public void add(Token token) {
1471      if (isMedia()) {
1472        throw new AssertionError("media ");
1473      }
1474       tokens.add(token);
1475    }
1476
1477    public void setMeText(String meText) {
1478      this.meText = meText;
1479    }
1480
1481    /** Returns the original text of this part. */
1482    public String getRawText() {
1483      StringBuilder buf = new StringBuilder();
1484      if (meText != null) {
1485        buf.append(meText);
1486      }
1487      for (int i = 0; i < tokens.size(); ++i) {
1488        buf.append(tokens.get(i).getRawText());
1489      }
1490      return buf.toString();
1491    }
1492
1493    /** Returns the tokens in this part. */
1494    public ArrayList<Token> getTokens() { return tokens; }
1495
1496    /** Adds the tokens into the given builder as an array. */
1497//    public void toArray(JSArrayBuilder array) {
1498//      if (isMedia()) {
1499//        // For media, we send its array (i.e., we don't wrap this in another
1500//        // array as we do for non-media parts).
1501//        tokens.get(0).toArray(array);
1502//      } else {
1503//        array.beginArray();
1504//        addToArray(array);
1505//        array.endArray();
1506//      }
1507//    }
1508  }
1509}
1510