1/**
2 * Copyright (c) 2004, Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16package com.android.mail.lib.html.parser;
17
18import com.android.mail.lib.base.CharEscapers;
19import com.android.mail.lib.base.CharMatcher;
20import com.android.mail.lib.base.Preconditions;
21import com.android.mail.lib.base.StringUtil;
22import com.android.mail.lib.base.X;
23import com.google.common.collect.Lists;
24import com.google.common.collect.Maps;
25import com.google.common.io.ByteStreams;
26
27import java.io.IOException;
28import java.util.ArrayList;
29import java.util.HashMap;
30import java.util.LinkedList;
31import java.util.List;
32import java.util.ListIterator;
33import java.util.regex.Matcher;
34import java.util.regex.Pattern;
35
36/**
37 * HtmlParser is a simple but efficient html parser.
38 * - It's simple because it does not do incremental parsing like some other
39 * parser. It assumes that the entire html text is available.
40 * - It offers 3 levels of aggressiveness in correcting errors in HTML (see
41 * HtmlParser.ParseStyle).
42 * - HTML comments are ignored unless initialized with ParseStyle.PRESERVE_ALL.
43 */
44public class HtmlParser {
45
46  // States
47  private enum State {
48    IN_TEXT, IN_TAG, IN_COMMENT, IN_CDATA
49  }
50
51  // The current state
52  private State state;
53
54  private int clipLength = Integer.MAX_VALUE;
55  private boolean clipped;
56
57  // The html text
58  private String html;
59
60  // The entire array of nodes
61  private List<HtmlDocument.Node> nodes;
62
63  // Turn on for debug information.
64  private static boolean DEBUG = false;
65
66  // Default whitelist
67  public static final HtmlWhitelist DEFAULT_WHITELIST = HTML4.getWhitelist();
68
69  // Whitelists for looking up accepted HTML tags and attributes
70  private List<HtmlWhitelist> whitelists = Lists.newArrayList(DEFAULT_WHITELIST);
71
72  /**
73   * This setting controls how much of the original HTML is preserved.  In
74   * ascending order of aggressiveness:
75   * - PRESERVE_ALL: Preserves all of original content.
76   * *** Warning - PRESERVE_ALL mode preserves invalid and unsafe HTML. ***
77   * - PRESERVE_VALID: Preserves only valid visible HTML handled by
78   * most browsers.  Discards comments, unknown tags and attributes, and
79   * nameless end tags.  Encodes all '<' characters that aren't part of a tag.
80   * - NORMALIZE: In addition to the changes made by PRESERVE_VALID, also
81   *   - unescapes then reescapes text to normalize entities
82   *   - normalizes whitespace and quotes around tag attributes
83   */
84  public enum ParseStyle { NORMALIZE, PRESERVE_VALID, PRESERVE_ALL }
85
86  /**
87   * True only in PRESERVE_ALL mode.
88   * @see HtmlParser.ParseStyle
89   */
90  private final boolean preserveAll;
91
92  /**
93   * True in either PRESERVE_ALL or PRESERVE_VALID mode.
94   * @see HtmlParser.ParseStyle
95   */
96  private final boolean preserveValidHtml;
97
98  /**
99   * @see HtmlParser#HtmlParser(HtmlParser.ParseStyle)
100   */
101  public HtmlParser() {
102    this(ParseStyle.NORMALIZE);
103  }
104
105  /**
106   * @param parseStyle Level of aggressiveness for how different
107   * toHTML()/toXHTML() are from original.
108   * @see HtmlParser.ParseStyle
109   */
110  public HtmlParser(ParseStyle parseStyle) {
111    preserveAll = (parseStyle == ParseStyle.PRESERVE_ALL);
112    preserveValidHtml = preserveAll || (parseStyle == ParseStyle.PRESERVE_VALID);
113  }
114
115  /**
116   * Sets the maximum length, in characters, of an HTML message.
117   *
118   * @param clipLength must be greater than zero.
119   * (It starts as Integer.MAX_VALUE)
120   */
121  public void setClipLength(int clipLength) {
122    if (clipLength <= 0) {
123      throw new IllegalArgumentException(
124        "clipLength '" + clipLength + "' <= 0");
125    }
126    this.clipLength = clipLength;
127  }
128
129  public boolean isClipped() {
130    return clipped;
131  }
132
133  /**
134   * Sets the HTML whitelist. Calling this overrides any whitelist(s) that
135   * the parser is configured to use. By default, the parser uses the standard
136   * HTML4 whitelist.
137   *
138   * This has no effect in <code>ParseStyle.PRESERVE_ALL</code> mode.
139   *
140   * @param whitelist The whitelist to use. Must not be null.
141   */
142  public void setWhitelist(HtmlWhitelist whitelist) {
143    Preconditions.checkNotNull(whitelist);
144    whitelists = Lists.newArrayList(whitelist);
145  }
146
147  /**
148   * Adds an HTML whitelist to the list of whitelists consulted when
149   * processing an element or attribute. By default, the parser only uses
150   * the standard HTML4 whitelist.
151   *
152   * Whitelists are consulted in reverse chronological order (starting from
153   * the most recently added whitelist). The last whitelist consulted will
154   * always be the standard HTML4 whitelist, unless this was overridden by
155   * a call to {@link #setWhitelist}.
156   *
157   * This has no effect in <code>ParseStyle.PRESERVE_ALL</code> mode.
158   *
159   * @param whitelist The whitelist to use.
160   */
161  public void addWhitelist(HtmlWhitelist whitelist) {
162    whitelists.add(whitelist);
163  }
164
165  /**
166   * These are characters that we don't want to allow unquoted in an attribute
167   * value because they might be interpreted by the browser as HTML control
168   * characters. They are the 5 characters that are escaped by
169   * com.google.common.base.CharEscapers.HTML_ESCAPE, plus '=' and whitespace.
170   * Note that it shouldn't be possible for '>' or whitespace to be parsed as
171   * part of an unquoted attribute value, but we leave them here for
172   * completeness.
173   * Package visibility for unit tests.
174   */
175  static Pattern NEEDS_QUOTING_ATTRIBUTE_VALUE_REGEX = Pattern.compile("[\"\'&<>=\\s]");
176
177  //------------------------------------------------------------------------
178  // Parsing
179  //------------------------------------------------------------------------
180
181  /**
182   * Parses a String as HTML.
183   *
184   * @param html String to parse
185   * @return an Html document
186   */
187  public HtmlDocument parse(String html) {
188    this.html = html;
189    // Use a LinkedList because we don't know the number of nodes ahead of
190    // time. This will be compacted into an ArrayList in coalesceTextNodes().
191    nodes = Lists.newLinkedList();
192    state = State.IN_TEXT;
193
194    clipped = false;
195    int end = html.length();
196    int clipEnd = Math.min(clipLength, end);
197
198    for (int i = 0; i < end && !clipped;) {
199
200      // At any one time, the parser is in one of these states:
201      int pos;
202      switch (state) {
203        case IN_TEXT:
204          // text will not attempt to parse beyond the clipping length
205          pos = scanText(i, clipEnd);
206          X.assertTrue(pos > i || state != State.IN_TEXT); // Must make progress.
207          break;
208
209        case IN_TAG:
210          pos = scanTag(i, end);
211          X.assertTrue(pos > i);        // Must make progress
212          break;
213
214        case IN_COMMENT:
215          pos = scanComment(i, end);
216          state = State.IN_TEXT;
217          X.assertTrue(pos > i);        // Must make progress
218          break;
219
220        case IN_CDATA:
221          pos = scanCDATA(i, end);
222          X.assertTrue(pos > i || state != State.IN_CDATA); // Must make progress
223          break;
224
225        default:
226          throw new Error("Unknown state!");
227      }
228
229      i = pos;
230
231      // If we've reached or gone beyond the clipping length, stop.
232      clipped = pos >= clipLength;
233    }
234
235    nodes = coalesceTextNodes(nodes);
236
237    HtmlDocument doc = new HtmlDocument(nodes);
238    nodes = null;
239    html = null;
240    return doc;
241  }
242
243  /**
244   * During the course of parsing, we may have multiple adjacent Text nodes,
245   * due to the sanitizer stripping out nodes between Text nodes. It is
246   * important to coalesce them so that later steps in the pipeline can
247   * treat the text as a single block (e.g. the step that inserts <wbr> tags).
248   * @param nodes Original nodes.
249   * @return Nodes with text nodes changed.
250   */
251  static List<HtmlDocument.Node> coalesceTextNodes(
252      List<HtmlDocument.Node> nodes) {
253    List<HtmlDocument.Node> out =
254        new ArrayList<HtmlDocument.Node>(nodes.size());
255    LinkedList<HtmlDocument.Text> textNodes = Lists.newLinkedList();
256
257    for (HtmlDocument.Node node : nodes) {
258      if (node instanceof HtmlDocument.Text) {
259        textNodes.add((HtmlDocument.Text) node);
260      } else {
261        mergeTextNodes(textNodes, out);
262        out.add(node);
263      }
264    }
265    mergeTextNodes(textNodes, out);
266    return out;
267  }
268
269  /**
270   * Flushes any Text nodes in {@code textNodes} into a single Text node
271   * in {@code output}. {@code textNodes} is guaranteed to be empty when
272   * the function returns.
273   * @param textNodes Text nodes.
274   * @param output Destination to which results are added.
275   */
276  private static void mergeTextNodes(LinkedList<HtmlDocument.Text> textNodes,
277                                     List<HtmlDocument.Node> output) {
278    if (!textNodes.isEmpty()) {
279      if (textNodes.size() == 1) {
280        output.add(textNodes.removeFirst());
281      } else {
282        int combinedTextLen = 0;
283        int combinedInputLen = 0;
284        for (HtmlDocument.Text text : textNodes) {
285          combinedTextLen += text.getText().length();
286          if (text.getOriginalHTML() != null) {
287            combinedInputLen += text.getOriginalHTML().length();
288          }
289        }
290        StringBuilder combinedText = new StringBuilder(combinedTextLen);
291        StringBuilder combinedInput = new StringBuilder(combinedInputLen);
292        while (!textNodes.isEmpty()) {
293          HtmlDocument.Text text = textNodes.removeFirst();
294          combinedText.append(text.getText());
295          if (text.getOriginalHTML() != null) {
296            combinedInput.append(text.getOriginalHTML());
297          }
298        }
299        String originalInput = combinedInputLen > 0 ? combinedInput.toString() : null;
300        output.add(HtmlDocument.createText(combinedText.toString(), originalInput));
301      }
302    }
303  }
304
305  //------------------------------------------------------------------------
306  // Text scanning
307  //------------------------------------------------------------------------
308  /**
309   * A truncated entity is something like <pre>&nbs or &#1a3</pre>.
310   * We only want to find these at the end of a clipped text.
311   */
312  private static final Pattern TRUNCATED_ENTITY =
313    Pattern.compile("\\& \\#? [0-9a-zA-Z]{0,8} $", Pattern.COMMENTS);
314
315  /**
316   * In a text mode, scan for a tag
317   * @param start Position in original html.
318   * @param end Position in original html.
319   * @return End position of scanned content.
320   */
321  int scanText(final int start, final int end) {
322    int pos;
323    for (pos = start; pos < end; pos++) {
324      char ch = html.charAt(pos);
325      if (ch == '<' && pos + 1 < end) {
326        // Check the next char
327        ch = html.charAt(pos + 1);
328        if (ch == '/' || Character.isLetter(ch) || ch == '!' || ch == '?') {
329
330          // Check if it's an html comment or tag
331          if (html.regionMatches(pos + 1, "!--", 0, 3)) {
332            state = State.IN_COMMENT;
333          } else {
334            state = State.IN_TAG;
335          }
336          break;
337        }
338      }
339    }
340
341    if (pos > start) {
342      int finalPos = pos;
343      String htmlTail = this.html.substring(start, finalPos);
344
345      if ((pos == clipLength) && (clipLength < html.length())) {
346        // We're clipping this HTML, not running off the end.
347        // If we're ending with what looks like a truncated entity,
348        // then clip that part off, too.
349        // If it really was a truncated entity, great.
350        // If it was a false positive, the user won't notice that we clipped
351        // an additional handful of characters.
352        Matcher matcher = TRUNCATED_ENTITY.matcher(htmlTail);
353        if (matcher.find()) {
354          int matchStart = matcher.start();
355          // The matcher matched in htmlTail, not html.
356          // htmlTail starts at html[start]
357          finalPos = start + matchStart;
358          htmlTail = htmlTail.substring(0, matchStart);
359        }
360      }
361
362      if (finalPos > start) {
363        String originalHtml = null;
364        if (preserveAll) {
365          originalHtml = htmlTail;
366        } else if (preserveValidHtml) {
367          // the only way htmlTail can start with '<' is if it's the last character
368          // in html; otherwise, we would have entered State.IN_TAG or
369          // State.IN_COMMENT above
370
371          // officially a '<' can be valid in a text node, but to be safe we
372          // always escape them
373          originalHtml = CharMatcher.is('<').replaceFrom(htmlTail, "&lt;");
374        }
375
376        HtmlDocument.Text textnode = HtmlDocument.createEscapedText(htmlTail, originalHtml);
377        nodes.add(textnode);
378      }
379    }
380    return pos;
381  }
382
383  //------------------------------------------------------------------------
384  // Tag name scanning utility class
385  //------------------------------------------------------------------------
386  private static class TagNameScanner {
387    private final String html;
388    private String tagName;
389    private int startNamePos = -1;
390    private int endNamePos = -1;
391
392    public TagNameScanner(String html) {
393      this.html = html;
394    }
395
396    /**
397     * Scans for a tag name. Sets #startNamePos and #endNamePos.
398     * @param start Position in original html.
399     * @param end Position in original html.
400     * @return End position of scanned content.
401     */
402    public int scanName(final int start, final int end) {
403      int pos;
404      for (pos = start; pos < end; pos++) {
405        char ch = html.charAt(pos);
406
407        // End of tag or end of name.
408        if ((ch == '>') || (ch == '/') || Character.isWhitespace(ch)) {
409          break;
410        }
411      }
412      if (pos > start) {
413        startNamePos = start;
414        endNamePos = pos;
415      }
416      return pos;
417    }
418
419    /**
420     * @return Tag name.
421     */
422    public String getTagName() {
423      if (tagName == null && startNamePos != -1 && endNamePos != -1) {
424        tagName = html.substring(startNamePos, endNamePos);
425      }
426      return tagName;
427    }
428  }
429
430  //------------------------------------------------------------------------
431  // Attribute scanning utility class
432  //------------------------------------------------------------------------
433  private static class AttributeScanner {
434    private final String html;
435    private String name;
436    private String value;
437
438    // The following have package visibility because they are accessed from
439    // HtmlParser.addAttribute() to handle preservation of original content
440    // around the attribute value, but quoting and escaping of the value itself.
441    int startNamePos = -1;
442    int endNamePos = -1;
443    int startValuePos = -1;
444    int endValuePos = -1;
445    boolean attrValueIsQuoted = false;
446
447    public AttributeScanner(String html) {
448      this.html = html;
449    }
450
451    /**
452     * Reset to scan another attribute.
453     */
454    public void reset() {
455      startNamePos = -1;
456      endNamePos = -1;
457      startValuePos = -1;
458      endValuePos = -1;
459      attrValueIsQuoted = false;
460      name = null;
461      value = null;
462    }
463
464    /**
465     * Scans for a tag attribute name. Sets startNamePos and endNamePos. Sets
466     * 'attrName'.
467     *
468     * @param start Position in original html
469     * @param end Position in original html
470     * @return End position of scanned content
471     */
472    int scanName(final int start, final int end) {
473      X.assertTrue(html.charAt(start) != '>');
474      if (start == end) {
475        // No attribute name
476        return start;
477      }
478
479      int pos;
480      for (pos = start + 1; pos < end; pos++) {
481        char ch = html.charAt(pos);
482
483        // End of tag or end of name.
484        if ((ch == '>') || (ch == '=') || (ch == '/') || Character.isWhitespace(ch)) {
485          break;
486        }
487      }
488      startNamePos = start;
489      endNamePos = pos;
490      return pos;
491    }
492
493    /**
494     * Scans for a tag attribute value. Sets startValuePos_ and endValuePos_.
495     *
496     * @param start Position in original html
497     * @param end Position in original html
498     * @return End position of scanned content
499     */
500    int scanValue(final int start, final int end) {
501      // Skip whitespace before '='.
502      int pos = skipSpaces(start, end);
503
504      // Handle cases with no attribute value.
505      if ((pos == end) || (html.charAt(pos) != '=')) {
506        // Return start so spaces will be parsed as part of next attribute,
507        // or end of tag.
508        return start;
509      }
510
511      // Skip '=' and whitespace after it.
512      pos++;
513      pos = skipSpaces(pos, end);
514
515      // Handle cases with no attribute value.
516      if (pos == end) {
517        return pos;
518      }
519
520      // Check for quote character ' or "
521      char ch = html.charAt(pos);
522      if (ch == '\'' || ch == '\"') {
523        attrValueIsQuoted = true;
524        pos++;
525        int valueStart = pos;
526        while (pos < end && html.charAt(pos) != ch) {
527          pos++;
528        }
529        startValuePos = valueStart;
530        endValuePos = pos;
531        if (pos < end) {
532          pos++;                        // Skip the ending quote char
533        }
534      } else {
535        int valueStart = pos;
536        for (; pos < end; pos++) {
537          ch = html.charAt(pos);
538
539          // End of tag or end of value. Not that '/' is included in the value
540          // even if it is the '/>' at the end of the tag.
541          if ((ch == '>') || Character.isWhitespace(ch)) {
542            break;
543          }
544        }
545        startValuePos = valueStart;
546        endValuePos = pos;
547      }
548
549      X.assertTrue(startValuePos > -1);
550      X.assertTrue(endValuePos > -1);
551      X.assertTrue(startValuePos <= endValuePos);
552      X.assertTrue(pos <= end);
553
554      return pos;
555    }
556
557    /**
558     * Skips white spaces.
559     *
560     * @param start Position in original html
561     * @param end Position in original html
562     * @return End position of scanned content
563     */
564    private int skipSpaces(final int start, final int end) {
565      int pos;
566      for (pos = start; pos < end; pos++) {
567        if (!Character.isWhitespace(html.charAt(pos))) {
568          break;
569        }
570      }
571      return pos;
572    }
573
574    public String getName() {
575      if (name == null && startNamePos != -1 && endNamePos != -1) {
576        name = html.substring(startNamePos, endNamePos);
577      }
578      return name;
579    }
580
581    public String getValue() {
582      if (value == null && startValuePos != -1 && endValuePos != -1) {
583        value = html.substring(startValuePos, endValuePos);
584      }
585      return value;
586    }
587  }
588
589  /**
590   * Holds any unrecognized elements we encounter.  Only applicable in
591   * PRESERVE_ALL mode.
592   */
593  private final HashMap<String,HTML.Element> unknownElements = Maps.newHashMap();
594
595  /**
596   * Holds any unrecognized attributes we encounter.  Only applicable in
597   * PRESERVE_ALL mode.
598   */
599  private final HashMap<String,HTML.Attribute> unknownAttributes = Maps.newHashMap();
600
601  /**
602   * @param name Element name.
603   * @return "Dummy" element.  Not useful for any real HTML processing, but
604   * gives us a placeholder for tracking original HTML contents.
605   */
606  private HTML.Element lookupUnknownElement(String name) {
607    name = name.toLowerCase();
608    HTML.Element result = unknownElements.get(name);
609    if (result == null) {
610      result = new HTML.Element(name,
611          HTML.Element.NO_TYPE,
612          /* empty */ false,
613          /* optional end tag */ true,
614          /* breaks flow*/ false,
615          HTML.Element.Flow.NONE);
616      unknownElements.put(name, result);
617    }
618    return result;
619  }
620
621  /**
622   * @param name Attribute name.
623   * @return "Dummy" attribute. Not useful for any real HTML processing, but
624   *         gives us a placeholder for tracking original HTML contents.
625   */
626  private HTML.Attribute lookupUnknownAttribute(String name) {
627    name = name.toLowerCase();
628    HTML.Attribute result = unknownAttributes.get(name);
629    if (result == null) {
630      result = new HTML.Attribute(name, HTML.Attribute.NO_TYPE);
631      unknownAttributes.put(name, result);
632    }
633    return result;
634  }
635
636  /**
637   * Scans for an HTML tag.
638   *
639   * @param start Position in original html.
640   * @param end Position in original html.
641   * @return End position of scanned content.
642   */
643  int scanTag(final int start, final int end) {
644    X.assertTrue(html.charAt(start) == '<');
645
646    // nameStart is where we begin scanning for the tag name and attributes,
647    // so we skip '<'.
648    int nameStart = start + 1;
649
650    // Next state is Text, except the case when we see a STYLE/SCRIPT tag. See
651    // code below.
652    state = State.IN_TEXT;
653
654    // End tag?
655    boolean isEndTag = false;
656    if (html.charAt(nameStart) == '/') {
657      isEndTag = true;
658      ++nameStart;
659    }
660
661    // Tag name and element
662    TagNameScanner tagNameScanner = new TagNameScanner(html);
663    int pos = tagNameScanner.scanName(nameStart, end);
664    String tagName = tagNameScanner.getTagName();
665    HTML.Element element = null;
666    if (tagName == null) {
667      // For some reason, browsers treat start and end tags differently
668      // when they don't have a valid tag name - end tags are swallowed
669      // (e.g., "</ >"), start tags treated as text (e.g., "< >")
670      if (!isEndTag) {
671        // This is not really a tag, treat the '<' as text.
672        HtmlDocument.Text text = HtmlDocument.createText("<", preserveAll ? "<" : null);
673        nodes.add(text);
674        state = State.IN_TEXT;
675        return nameStart;
676      }
677
678      if (preserveAll) {
679        element = lookupUnknownElement("");
680      }
681    } else {
682      element = lookupElement(tagName);
683      if (element == null) {
684        if (DEBUG) {
685          // Unknown element
686          debug("Unknown element: " + tagName);
687        }
688        if (preserveAll) {
689          element = lookupUnknownElement(tagName);
690        }
691      }
692    }
693
694    // Attributes
695    boolean isSingleTag = false;
696    ArrayList<HtmlDocument.TagAttribute> attributes = null;
697    int allAttributesStartPos = pos;
698    int nextAttributeStartPos = pos;
699    AttributeScanner attributeScanner = new AttributeScanner(html);
700    while (pos < end) {
701      int startPos = pos;
702      char ch = html.charAt(pos);
703
704      // Are we at the end of the tag?
705      if ((pos + 1 < end) && (ch == '/') && (html.charAt(pos + 1) == '>')) {
706        isSingleTag = true;
707        ++pos;
708        break;                          // Done
709      }
710      if (ch == '>') {
711        break;                          // Done
712      }
713
714      // See bug 870742 (Buganizer).
715      if (isEndTag && ('<' == ch)) {
716        // '<' not allowed in end tag, so we finish processing this tag and
717        // return to State.IN_TEXT. We mimic Safari & Firefox, which both
718        // terminate the tag when it contains a '<'.
719        if (element != null) {
720          addEndTag(element, start, allAttributesStartPos, pos);
721        }
722        state = State.IN_TEXT;
723        return pos;
724      }
725
726      if (Character.isWhitespace(ch)) {
727        // White space, skip it.
728        ++pos;
729      } else {
730        // Scan for attribute
731        attributeScanner.reset();
732        pos = attributeScanner.scanName(pos, end);
733        X.assertTrue(pos > startPos);
734
735        // If it's a valid attribute, scan attribute values
736        if (attributeScanner.getName() != null) {
737          pos = attributeScanner.scanValue(pos, end);
738
739          // Add the attribute to the list
740          if (element != null) {
741            if (attributes == null) {
742              attributes = new ArrayList<HtmlDocument.TagAttribute>();
743            }
744            addAttribute(attributes, attributeScanner, nextAttributeStartPos, pos);
745          }
746          nextAttributeStartPos = pos;
747        }
748      }
749
750      // Make sure that we make progress!
751      X.assertTrue(pos > startPos);
752    }
753
754    // Cannot find the close tag, so we treat this as text
755    if (pos == end) {
756      X.assertTrue(start < end);
757      String textNodeContent = html.substring(start, end);
758      String originalContent = null;
759      if (preserveAll) {
760        originalContent = textNodeContent;
761      } else if (preserveValidHtml) {
762        // Officially a '<' can be valid in a text node, but to be safe we
763        // always escape them.
764        originalContent =
765            CharMatcher.is('<').replaceFrom(html.substring(start, end), "&lt;");
766      }
767      nodes.add(HtmlDocument.createEscapedText(textNodeContent, originalContent));
768      return end;
769    }
770
771    // Skip '>'
772    X.assertTrue(html.charAt(pos) == '>');
773    pos++;
774
775    // Check if it's an element we're keeping (either an HTML4 element, or an
776    // unknown element we're preserving). If not, ignore the tag.
777    if (element != null) {
778      if (isEndTag) {
779        addEndTag(element, start, allAttributesStartPos, pos);
780      } else {
781        // Special case: if it's a STYLE/SCRIPT element, we go to into
782        // CDATA state.
783        if (HTML4.SCRIPT_ELEMENT.equals(element) || HTML4.STYLE_ELEMENT.equals(element)) {
784          state = State.IN_CDATA;
785        }
786
787        addStartTag(element, start, allAttributesStartPos,
788            nextAttributeStartPos,
789            pos, isSingleTag, attributes);
790      }
791    }
792
793    return pos;
794  }
795
796  /**
797   * Lookups the element in our whitelist(s). Whitelists are consulted in
798   * reverse chronological order (starting from the most recently added
799   * whitelist), allowing clients to override the default behavior.
800   *
801   * @param name Element name.
802   * @return Element.
803   */
804  HTML.Element lookupElement(String name) {
805    ListIterator<HtmlWhitelist> iter = whitelists.listIterator(whitelists.size());
806    while (iter.hasPrevious()) {
807      HTML.Element elem = iter.previous().lookupElement(name);
808      if (elem != null) {
809        return elem;
810      }
811    }
812    return null;
813  }
814
815  /**
816   * Lookups the attribute in our whitelist(s). Whitelists are consulted in
817   * reverse chronological order (starting from the most recently added
818   * whitelist), allowing clients to override the default behavior.
819   *
820   * @param name Attribute name.
821   * @return Attribute.
822   */
823  HTML.Attribute lookupAttribute(String name) {
824    ListIterator<HtmlWhitelist> iter = whitelists.listIterator(whitelists.size());
825    while (iter.hasPrevious()) {
826      HTML.Attribute attr = iter.previous().lookupAttribute(name);
827      if (attr != null) {
828        return attr;
829      }
830    }
831    return null;
832  }
833
834  /**
835   * @param element Tag element
836   * @param startPos Start of tag, including '<'
837   * @param startAttributesPos Start of attributes. This is the first
838   * character after the tag name. If there are no attributes, this is the end
839   * of the tag.
840   * @param endAttributesPos First position after last attribute
841   * @param endPos End of tag, including '>' character
842   * @param isSingleTag True iff this is a self-terminating tag
843   * @param attributes Tag attributes
844   */
845  private void addStartTag(HTML.Element element, final int startPos,
846      final int startAttributesPos, final int endAttributesPos,
847      final int endPos, final boolean isSingleTag,
848      ArrayList<HtmlDocument.TagAttribute> attributes) {
849    X.assertTrue(startPos < startAttributesPos);
850    X.assertTrue(startAttributesPos <= endAttributesPos);
851    X.assertTrue(endAttributesPos <= endPos);
852
853    if (preserveAll) {
854      String beforeAttrs = html.substring(startPos, startAttributesPos);
855      String afterAttrs = html.substring(endAttributesPos, endPos);
856      HtmlDocument.Tag tag = (isSingleTag)
857          ? HtmlDocument.createSelfTerminatingTag(element, attributes,
858              beforeAttrs, afterAttrs)
859          : HtmlDocument.createTag(element, attributes,
860              beforeAttrs, afterAttrs);
861      nodes.add(tag);
862    } else if (preserveValidHtml) {
863      // This is the beginning of the tag up through the tag name. It should not
864      // be possible for this to contain characters needing escaping, but we add
865      // this redundant check to avoid an XSS attack that might get past our
866      // parser but trick a browser into executing a script.
867      X.assertTrue(html.charAt(startPos) == '<');
868      StringBuilder beforeAttrs = new StringBuilder("<");
869      String tagName = html.substring(startPos + 1, startAttributesPos);
870      beforeAttrs.append(CharEscapers.asciiHtmlEscaper().escape(tagName));
871
872      // Verify end-of-tag characters
873      int endContentPos = endPos - 1;
874      X.assertTrue(html.charAt(endContentPos) == '>');
875      if (isSingleTag) {
876        --endContentPos;
877        X.assertTrue(html.charAt(endContentPos) == '/');
878      }
879      X.assertTrue(endAttributesPos <= endContentPos);
880
881      // This is any extra characters between the last attribute and the end of
882      // the tag.
883      X.assertTrue(endAttributesPos < endPos);
884      String afterAttrs = html.substring(endAttributesPos, endPos);
885
886      // Strip all but preceding whitespace.
887      HtmlDocument.Tag tag = (isSingleTag)
888          ? HtmlDocument.createSelfTerminatingTag(element, attributes,
889              beforeAttrs.toString(), afterAttrs)
890          : HtmlDocument.createTag(element, attributes,
891              beforeAttrs.toString(), afterAttrs);
892      nodes.add(tag);
893    } else {
894      // Normalize.
895      HtmlDocument.Tag tag = (isSingleTag)
896          ? HtmlDocument.createSelfTerminatingTag(element, attributes)
897          : HtmlDocument.createTag(element, attributes);
898      nodes.add(tag);
899    }
900  }
901
902 /**
903   * @param element End tag element.
904   * @param startPos Start of tag, including '<'.
905   * @param startAttributesPos Start of attributes. This is the first
906   * character after the tag name. If there are no attributes, this is the end
907   * of the tag.
908   * @param endPos End of tag. This usually contains the '>' character, but in
909   * the case where browsers force a termination of a malformed tag, it doesn't.
910   */
911  private void addEndTag(HTML.Element element, final int startPos,
912      final int startAttributesPos, final int endPos) {
913    X.assertTrue(element != null);
914    X.assertTrue(html.charAt(startPos) == '<');
915    X.assertTrue(html.charAt(startPos + 1) == '/');
916
917    if (preserveAll) {
918      // Preserve all: keep actual content even if it's malformed.
919      X.assertTrue(startPos < endPos);
920      String content = html.substring(startPos, endPos);
921      nodes.add(HtmlDocument.createEndTag(element, content));
922    } else if (preserveValidHtml) {
923      // Preserve valid: terminate the tag.
924
925      StringBuilder validContent = new StringBuilder("</");
926
927      // This is the beginning of the tag up through the tag name. It should not
928      // be possible for this to contain characters needing escaping, but we add
929      // this redundant check to avoid an XSS attack that might get past our
930      // parser but trick a browser into executing a script.
931      X.assertTrue(startPos < startAttributesPos);
932      String tagName = html.substring(startPos + 2, startAttributesPos);
933      validContent.append(CharEscapers.asciiHtmlEscaper().escape(tagName));
934
935      // This is the rest of the tag, including any attributes.
936      // See bug 874396 (Buganizer). We don't allow attributes in an end tag.
937      X.assertTrue(startAttributesPos <= endPos);
938      String endOfTag = html.substring(startAttributesPos, endPos);
939      if (endOfTag.charAt(endOfTag.length() - 1) != '>') {
940        endOfTag += '>';
941      }
942
943      // Strip everything but leading whitespace.
944      validContent.append(endOfTag.replaceAll("\\S+.*>", ">"));
945
946      nodes.add(HtmlDocument.createEndTag(element, validContent.toString()));
947    } else {
948      // Normalize: ignore the original content.
949      nodes.add(HtmlDocument.createEndTag(element));
950    }
951  }
952
953  /**
954   * Creates and adds an attribute to the list.
955   *
956   * @param attributes Destination of new attribute.
957   * @param scanner Scanned attribute.
958   * @param startPos start position (inclusive) in original HTML of this
959   *        attribute, including preceeding separator characters (generally this
960   *        is whitespace, but it might contain other characters). This is the
961   *        end position of the tag name or previous attribute +1.
962   * @param endPos end position (exclusive) in original HTML of this attribute.
963   */
964  private void addAttribute(ArrayList<HtmlDocument.TagAttribute> attributes,
965      AttributeScanner scanner, final int startPos, final int endPos) {
966    X.assertTrue(startPos < endPos);
967
968    String name = scanner.getName();
969    X.assertTrue(name != null);
970    HTML.Attribute htmlAttribute = lookupAttribute(name);
971
972    // This can be null when there's no value, e.g., input.checked attribute.
973    String value = scanner.getValue();
974
975    if (htmlAttribute == null) {
976      // Unknown attribute.
977      if (DEBUG) {
978        debug("Unknown attribute: " + name);
979      }
980      if (preserveAll) {
981        String original = html.substring(startPos, endPos);
982        attributes.add(HtmlDocument.createTagAttribute(
983            lookupUnknownAttribute(name), value, original));
984      }
985    } else {
986      String unescapedValue = (value == null) ? null : StringUtil.unescapeHTML(value);
987      if (preserveAll) {
988        attributes.add(HtmlDocument.createTagAttribute(htmlAttribute,
989            unescapedValue, html.substring(startPos, endPos)));
990      } else if (preserveValidHtml) {
991        StringBuilder original = new StringBuilder();
992
993        // This includes any separator characters between the tag name or
994        // preceding attribute and this one.
995        // This addresses bugs 870757 and 875303 (Buganizer).
996        // Don't allow non-whitespace separators between attributes.
997        X.assertTrue(startPos <= scanner.startNamePos);
998        String originalPrefix = html.substring(
999            startPos, scanner.startNamePos).replaceAll("\\S+", "");
1000        if (originalPrefix.length() == 0) {
1001          originalPrefix = " ";
1002        }
1003        original.append(originalPrefix);
1004
1005        if (value == null) {
1006          // This includes the name and any following whitespace. Escape in case
1007          // the name has any quotes or '<' that could confuse a browser.
1008          X.assertTrue(scanner.startNamePos < endPos);
1009          String nameEtc = html.substring(scanner.startNamePos, endPos);
1010          original.append(CharEscapers.asciiHtmlEscaper().escape(nameEtc));
1011        } else {
1012          // Escape name in case the name has any quotes or '<' that could
1013          // confuse a browser.
1014          original.append(CharEscapers.asciiHtmlEscaper().escape(name));
1015
1016          // This includes the equal sign, and any other whitespace
1017          // between the name and value. It also contains the opening quote
1018          // character if there is one.
1019          X.assertTrue(scanner.endNamePos < scanner.startValuePos);
1020          original.append(html.substring(scanner.endNamePos, scanner.startValuePos));
1021
1022          // This is the value, excluding any quotes.
1023          if (scanner.attrValueIsQuoted) {
1024            // Officially a '<' can be valid in an attribute value, but to be
1025            // safe we always escape them.
1026            original.append(value.replaceAll("<", "&lt;"));
1027          } else {
1028            // This addresses bug 881426 (Buganizer). Put quotes around any
1029            // dangerous characters, which is what most of the browsers do.
1030            if (NEEDS_QUOTING_ATTRIBUTE_VALUE_REGEX.matcher(value).find()) {
1031              original.append('"');
1032              original.append(value.replaceAll("\"", "&quot;"));
1033              original.append('"');
1034            } else {
1035              original.append(value);
1036            }
1037          }
1038
1039          // This includes end quote, if applicable.
1040          X.assertTrue(scanner.endValuePos <= endPos);
1041          original.append(html.substring(scanner.endValuePos, endPos));
1042        }
1043
1044        attributes.add(HtmlDocument.createTagAttribute(
1045            htmlAttribute, unescapedValue, original.toString()));
1046      } else {
1047        attributes.add(HtmlDocument.createTagAttribute(
1048            htmlAttribute, unescapedValue));
1049      }
1050    }
1051  }
1052
1053  //------------------------------------------------------------------------
1054  // Comment scanning
1055  //------------------------------------------------------------------------
1056  private static final String START_COMMENT = "<!--";
1057  private static final String END_COMMENT = "-->";
1058
1059  private int scanComment(final int start, final int end) {
1060
1061    X.assertTrue(html.regionMatches(start, START_COMMENT, 0, START_COMMENT.length()));
1062
1063    // Scan for end of comment
1064    int pos = html.indexOf(END_COMMENT, start + START_COMMENT.length());
1065    if (pos != -1) {
1066      pos += END_COMMENT.length();
1067    } else {
1068      // Look for '>'. If we can't find that, the rest of the text is comments.
1069      pos = html.indexOf('>', start + 4);
1070      if (pos != -1) {
1071        ++pos;
1072      } else {
1073        pos = end;
1074      }
1075    }
1076
1077    if (preserveAll) {
1078      nodes.add(HtmlDocument.createHtmlComment(html.substring(start, pos)));
1079    }
1080
1081    return pos;
1082  }
1083
1084  //------------------------------------------------------------------------
1085  // CDATA scanning
1086  //------------------------------------------------------------------------
1087  int scanCDATA(final int start, final int end) {
1088
1089    // Get the tag: must be either STYLE or SCRIPT
1090    HtmlDocument.Tag tag = (HtmlDocument.Tag) nodes.get(nodes.size() - 1);
1091    HTML.Element element = tag.getElement();
1092    X.assertTrue(HTML4.SCRIPT_ELEMENT.equals(element) || HTML4.STYLE_ELEMENT.equals(element));
1093
1094    int pos;
1095    for (pos = start; pos < end; pos++) {
1096      if (pos + 2 < end &&
1097          html.charAt(pos) == '<' &&
1098          html.charAt(pos + 1) == '/' &&
1099          html.regionMatches(true, pos + 2, element.getName(), 0,
1100                              element.getName().length())) {
1101        break;
1102      }
1103    }
1104
1105    // Add a CDATA node
1106    if (pos > start) {
1107      HtmlDocument.CDATA cdata =
1108        HtmlDocument.createCDATA(html.substring(start, pos));
1109      nodes.add(cdata);
1110    }
1111
1112    state = State.IN_TAG;
1113    return pos;
1114  }
1115
1116  //------------------------------------------------------------------------
1117  public static void main(String[] args) throws IOException {
1118
1119    DEBUG = true;
1120
1121    String html = new String(ByteStreams.toByteArray(System.in), "ISO-8859-1");
1122
1123    HtmlParser parser = new HtmlParser();
1124    HtmlDocument doc = parser.parse(html);
1125    System.out.println(doc.toString());
1126  }
1127
1128  private static void debug(String str) {
1129    System.err.println(str);
1130  }
1131}