EntityResolver.java revision 56ed4167b942ec265f9cee70ac4d71d10b3835ce
1/*
2 * Copyright (C) 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.streamhtmlparser.util;
18
19import com.google.common.base.Preconditions;
20import com.google.common.collect.ImmutableMap;
21
22import java.util.Map;
23
24/**
25 * <p>Decodes (unescapes) HTML entities with the complication that these
26 * are received one character at a time hence must be stored temporarily.
27 * Also, we may receive some "junk" characters before the actual
28 * entity which we will discard.
29 *
30 * <p>This class is designed to be 100% compatible with the corresponding
31 * logic in the C-version of the
32 * {@link com.google.security.streamhtmlparser.HtmlParser}, found
33 * in <code>htmlparser.c</code>. There are however a few intentional
34 * differences outlines below:
35 * <ul>
36 *   <li>We accept lower and upper-case hex NCRs, the C-version
37 *       accepts only lower-case ones.
38 *   <li>The output on some invalid inputs may be different. This is
39 *       currently in the process of consolidation with Filipe.
40 *   <li>The API is a bit different, I find this one better suited
41 *       for Java. In particular, the C method <code>processChar</code>
42 *       returns the output {@code String} whereas in Java, we return
43 *       a status code and then provide the {@code String} in a separate
44 *       method <code>getEntity</code>. It is cleaner as it avoids the
45 *       need to return empty {@code String}s during incomplete processing.
46 * </ul>
47 *
48 * <p>Valid HTML entities have one of the following three forms:
49 * <ul>
50 *   <li><code>&amp;dd;</code> where dd is a number in decimal (base 10) form.
51 *   <li><code>&amp;x|Xyy;</code> where yy is a hex-number (base 16).
52 *   <li><code>&&lt;html-entity&gt;;</code> where
53 *       <code>&lt;html-entity&gt;</code> is one of <code>lt</code>,
54 *       <code>gt</code>, <code>amp</code>, <code>quot</code> or
55 *       <code>apos</code>.
56 * </ul>
57 *
58 * <p>A <code>reset</code> method is provided to facilitate object re-use.
59 */
60public class EntityResolver {
61
62  /**
63   * Returned in <code>processChar</code> method.
64   * <p>
65   * <ul>
66   *   <li><code>NOT_STARTED</code> indicates we are still processing
67   *       trailing characters before the start of an entity.
68   *       The caller may want to save the characters it provided us.
69   *   <li><code>IN_PROGRESS</code> indicates we are currently processing
70   *       characters part of an entity.
71   *   <li><code>COMPLETED</code> indicates we have finished processing
72   *       an entity. The caller can then invoke <code>getEntity</code>
73   *       then re-set the object for future re-use.
74   * </ul>
75   */
76  public enum Status {
77    NOT_STARTED("Not Started"),
78    IN_PROGRESS("In Progress"),
79    COMPLETED("Completed");
80
81    private final String message;
82
83    private Status(String message) {
84      this.message = message;
85    }
86
87    /**
88     * Returns a brief description of the {@code Status} for
89     * debugging purposes. The format of the returned {@code String}
90     * is not fully specified nor guaranteed to remain the same.
91     */
92    @Override
93    public String toString() {
94      return message;
95    }
96  }
97
98  /**
99   * How many characters to store as we are processing an entity. Once we
100   * reach that size, we know the entity is definitely invalid. The size
101   * is higher than needed but keeping it as-is for compatibility with
102   * the C-version.
103   */
104  private static final int MAX_ENTITY_SIZE = 10;
105
106  /**
107   * Map containing the recognized HTML entities and their decoded values.
108   * The trailing ';' is not included in the key but it is accounted for.
109   */
110  private static final Map<String, String> HTML_ENTITIES_MAP =
111      new ImmutableMap.Builder<String, String>()
112          .put("&lt", "<")
113          .put("&gt", ">")
114          .put("&amp", "&")
115          .put("&apos", "'")
116          .build();
117
118  /** Storage for received until characters until an HTML entity is complete. */
119  private final StringBuilder sb;
120
121  /**
122   * Indicates the state we are in. see {@link EntityResolver.Status}.
123   */
124  private Status status;
125  private String entity;
126
127  /**
128   * Constructs an entity resolver that is initially empty and
129   * with status {@code NOT_STARTED}, see {@link EntityResolver.Status}.
130   *
131   */
132  public EntityResolver() {
133    sb = new StringBuilder();
134    status = Status.NOT_STARTED;
135    entity = "";
136  }
137
138  /**
139   * Constructs an entity resolver that is an exact copy of
140   * the one provided. In particular it has the same contents
141   * and status.
142   *
143   * @param aEntityResolver the entity resolver to copy
144   */
145  public EntityResolver(EntityResolver aEntityResolver) {
146    sb = new StringBuilder();
147    sb.replace(0, sb.length(), aEntityResolver.sb.toString());
148    entity = aEntityResolver.entity;
149    status = aEntityResolver.status;
150  }
151
152  /**
153   * Returns the object to its original state for re-use, deleting any
154   * stored characters that may be present.
155   */
156  public void reset() {
157    status = Status.NOT_STARTED;
158    sb.setLength(0);
159    entity = "";
160  }
161
162  /**
163   * Returns the full state of the <code>StreamEntityResolver</code>
164   * in a human readable form. The format of the returned <code>String</code>
165   * is not specified and is subject to change.
166   *
167   * @return full state of this object
168   */
169  @Override
170  public String toString() {
171    return String.format("Status: %s; Contents (%d): %s", status.toString(),
172                         sb.length(), sb.toString());
173  }
174
175  /**
176   * Returns the decoded HTML Entity. Should only be called
177   * after {@code processChar} returned status {@code COMPLETED}.
178   *
179   * @return the decoded HTML Entity or an empty {@code String} if
180   *         we were called with any status other than {@code COMPLETED}
181   */
182  public String getEntity() {
183    return entity;
184  }
185
186  /**
187   * Processes a character from the input stream and decodes any html entities
188   * from that processed input stream.
189   *
190   * @param input the {@code char} to process
191   * @return the processed {@code String}. Typically returns an empty
192   *         {@code String} while awaiting for more characters to complete
193   *         processing of the entity.
194   */
195  public Status processChar(char input) {
196    // Developer error if the precondition fails.
197    Preconditions.checkState(status != Status.NOT_STARTED || sb.length() == 0);
198    if (status == Status.NOT_STARTED) {
199      if (input == '&') {
200        sb.append(input);
201        status = Status.IN_PROGRESS;
202      }
203    } else if (status == Status.IN_PROGRESS) {
204      if ((input == ';') || (HtmlUtils.isHtmlSpace(input))) {
205        status = Status.COMPLETED;
206        entity = convertEntity(input);
207      } else {
208        if (sb.length() < MAX_ENTITY_SIZE) {
209          sb.append(input);
210        } else {
211          status = Status.COMPLETED;
212          entity = uncovertedInput(input);
213        }
214      }
215    } else {
216      // Status.COMPLETED, ignore character, do nothing.
217    }
218    return status;
219  }
220
221  /**
222   * Performs the decoding of a complete HTML entity and saves the
223   * result back into the buffer.
224   * <a href="http://www.w3.org/TR/REC-html40/charset.html#h-5.3.1">
225   * Numeric Character References</a>
226   *
227   * @param terminator the last character read, unused on successful
228   *        conversions since it is the end delimiter of the entity
229   * @return The decoded entity or the original input if we could not decode it.
230   */
231  private String convertEntity(char terminator) {
232    // Developer error if the buffer was empty or does not start with '&'.
233    Preconditions.checkArgument(sb.length() > 0);
234    Preconditions.checkArgument(sb.charAt(0) == '&');
235
236    if (sb.length() > 1) {
237      if (sb.charAt(1) == '#') {
238        if (sb.length() <= 2) {    // Error => return content as-is.
239          return uncovertedInput(terminator);
240        }
241        try {
242          if ((sb.charAt(2) == 'x') || (sb.charAt(2) == 'X')) {    // Hex NCR
243            return new String(Character.toChars(
244                Integer.parseInt(sb.substring(3), 16)));
245          } else {                                              // Decimal NCR
246            return new String(Character.toChars(
247                Integer.parseInt(sb.substring(2))));
248          }
249        } catch (NumberFormatException e) {
250          return uncovertedInput(terminator);
251        }
252      }
253
254      // See if it matches any of the few recognized entities.
255      String key = sb.toString();
256      if (HTML_ENTITIES_MAP.containsKey(key)) {
257        return HTML_ENTITIES_MAP.get(key);
258      }
259    }
260    // Covers the case of a lonely '&' given or valid/invalid unknown entities.
261    return uncovertedInput(terminator);
262  }
263
264  private String uncovertedInput(char terminator) {
265    return String.format("%s%c", sb.toString(), terminator);
266  }
267}
268