1/*
2 * Copyright (C) 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.streamhtmlparser;
18
19import com.google.streamhtmlparser.impl.HtmlParserImpl;
20
21import java.util.Set;
22import java.util.logging.Logger;
23
24/**
25 * A factory class to obtain instances of an {@link HtmlParser}.
26 * Currently each instance is a new object given these are fairly
27 * light-weight.
28 *
29 * <p>In the unlikely case that this class fails to initialize properly
30 * (a developer error), an error is emitted to the error console and the logs
31 * and the specialized parser creation methods will throw
32 * an {@link AssertionError} on all invokations.
33 */
34public class HtmlParserFactory {
35
36  private static final Logger logger =
37      Logger.getLogger(HtmlParserFactory.class.getName());
38
39  /**
40   * To provide additional options when creating an {@code HtmlParser} using
41   * {@link HtmlParserFactory#createParserInAttribute(HtmlParser.ATTR_TYPE,
42   *        boolean, Set)}
43   */
44  public enum AttributeOptions {
45
46    /**
47     * Indicates that the attribute value is Javascript-quoted. Only takes
48     * effect for Javascript-accepting attributes - as identified by
49     * {@link HtmlParser.ATTR_TYPE#JS} - and only when the attribute is also
50     * HTML quoted.
51     */
52    JS_QUOTED,
53
54    /**
55     * Indicates the attribute value is only a part of a URL as opposed to a
56     * full URL. In particular, the value is not at the start of a URL and
57     * hence does not necessitate validation of the URL scheme.
58     * Only valid for URI-accepting attributes - as identified by
59     * {@link HtmlParser.ATTR_TYPE#URI}.
60     */
61    URL_PARTIAL,
62  }
63
64  /**
65   * To provide additional options when creating an {@code HtmlParser} using
66   * {@link HtmlParserFactory#createParserInMode(HtmlParser.Mode, Set)}
67   */
68  public enum ModeOptions {
69
70    /**
71     * Indicates that the parser is inside a quoted {@code String}. Only
72     * valid in the {@link HtmlParser.Mode#JS} mode.
73     */
74    JS_QUOTED
75  }
76
77  private static final HtmlParser parserInDefaultAttr = createParser();
78  private static final HtmlParser parserInDefaultAttrQ = createParser();
79  private static final HtmlParser parserInUriAttrComplete = createParser();
80  private static final HtmlParser parserInUriAttrQComplete = createParser();
81  private static final HtmlParser parserInUriAttrPartial = createParser();
82  private static final HtmlParser parserInUriAttrQPartial = createParser();
83  private static final HtmlParser parserInJsAttr = createParser();
84  private static final HtmlParser parserInJsAttrQ = createParser();
85  private static final HtmlParser parserInQJsAttr = createParser();
86  private static final HtmlParser parserInStyleAttr = createParser();
87  private static final HtmlParser parserInStyleAttrQ = createParser();
88  private static final HtmlParser parserInJsQ = createParser();
89
90  /**
91   * Protects all the createParserXXX methods by throwing a run-time exception
92   * if this class failed to initialize properly.
93   */
94  private static boolean initSuccess = false;
95
96  static {
97    try {
98      initializeParsers();
99      initSuccess = true;
100    } catch (ParseException e) {
101      // Log a severe error and print it to stderr along with a stack trace.
102      String error = HtmlParserFactory.class.getName() +
103                     " Failed initialization: " + e.getMessage();
104      logger.severe(error);
105      System.err.println(error);
106      e.printStackTrace();
107    }
108  }
109
110  // Static class.
111  private HtmlParserFactory() {
112  }  // COV_NF_LINE
113
114  /**
115   * Returns an {@code HtmlParser} object ready to parse HTML input.
116   *
117   * @return an {@code HtmlParser} in the provided mode
118   */
119  public static HtmlParser createParser() {
120    return new HtmlParserImpl();
121  }
122
123  /**
124   * Returns an {@code HtmlParser} object initialized with the
125   * requested Mode. Provide non {@code null} options to provide
126   * a more precise initialization with the desired Mode.
127   *
128   * @param mode the mode to reset the parser with
129   * @param options additional options or {@code null} for none
130   * @return an {@code HtmlParser} in the provided mode
131   * @throws AssertionError when this class failed to initialize
132   */
133  public static HtmlParser createParserInMode(HtmlParser.Mode mode,
134                                              Set<ModeOptions> options) {
135    requireInitialized();
136
137    if (options != null && options.contains(ModeOptions.JS_QUOTED))
138      return createParser(parserInJsQ);
139
140    // With no options given, this method is just a convenience wrapper for
141    // the two calls below.
142    HtmlParser parser = new HtmlParserImpl();
143    parser.resetMode(mode);
144    return parser;
145  }
146
147  /**
148   * Returns an {@code HtmlParser} that is a copy of the one
149   * supplied. It holds the same internal state and hence can
150   * proceed with parsing in-lieu of the supplied parser.
151   *
152   * @param aHtmlParser a {@code HtmlParser} to copy from
153   * @return an {@code HtmlParser} that is a copy of the provided one
154   * @throws AssertionError when this class failed to initialize
155   */
156  public static HtmlParser createParser(HtmlParser aHtmlParser) {
157    requireInitialized();
158
159    // Should never get a ClassCastException since there is only one
160    // implementation of the HtmlParser interface.
161    return new HtmlParserImpl((HtmlParserImpl) aHtmlParser);
162  }
163
164  /**
165   * A very specialized {@code HtmlParser} accessor that returns a parser
166   * in a state where it expects to read the value of an attribute
167   * of an HTML tag. This is only useful when the parser has not seen a
168   * certain HTML tag and an attribute name and needs to continue parsing
169   * from a state as though it has.
170   *
171   * <p>For example, to create a parser in a state akin to that
172   * after the parser has parsed "&lt;a href=\"", invoke:
173   * <pre>
174   *   createParserInAttribute(HtmlParser.ATTR_TYPE.URI, true)}
175   * </pre>
176   *
177   * <p>You must provide the proper value of quoting or the parser
178   * will go into an unexpected state.
179   * As a special-case, when called with the {@code HtmlParser.ATTR_TYPE}
180   * of {@code HtmlParser.ATTR_TYPE.NONE}, the parser is created in a state
181   * inside an HTML tag where it expects an attribute name not an attribute
182   * value. It becomes equivalent to a parser initialized in the
183   * {@code HTML_IN_TAG} mode.
184   *
185   * @param attrtype the attribute type which the parser should be in
186   * @param quoted whether the attribute value is enclosed in double quotes
187   * @param options additional options or {@code null} for none
188   * @return an {@code HtmlParser} initialized in the given attribute type
189   *         and quoting
190   * @throws AssertionError when this class failed to initialize
191   */
192  public static HtmlParser createParserInAttribute(
193      HtmlParser.ATTR_TYPE attrtype,
194      boolean quoted, Set<AttributeOptions> options) {
195    requireInitialized();
196
197    HtmlParser parser;
198    switch (attrtype) {
199      case REGULAR:
200        parser = createParser(
201            quoted ? parserInDefaultAttrQ : parserInDefaultAttr);
202        break;
203      case URI:
204        if (options != null && options.contains(AttributeOptions.URL_PARTIAL))
205          parser = createParser(
206              quoted ? parserInUriAttrQPartial : parserInUriAttrPartial);
207        else
208          parser = createParser(
209              quoted ? parserInUriAttrQComplete : parserInUriAttrComplete);
210        break;
211      case JS:
212        // Note: We currently do not support the case of the value being
213        // inside a Javascript quoted string that is in an unquoted HTML
214        // attribute, such as <a href=bla onmouseover=alert('[VALUE')>.
215        // It would be simple to add but currently we assume Javascript
216        // quoted attribute values are always HTML quoted.
217        if (quoted) {
218          if (options != null && options.contains(AttributeOptions.JS_QUOTED))
219            parser = createParser(parserInQJsAttr);
220          else
221            parser = createParser(parserInJsAttrQ);
222        } else {
223          parser = createParser(parserInJsAttr);
224        }
225        break;
226      case STYLE:
227        parser = createParser(
228            quoted ? parserInStyleAttrQ : parserInStyleAttr);
229        break;
230      case NONE:
231        parser = createParserInMode(HtmlParser.Mode.HTML_IN_TAG, null);
232        break;
233      default:
234        throw new IllegalArgumentException(
235            "Did not recognize ATTR_TYPE given: " + attrtype);
236    }
237    return parser;
238  }
239
240  /**
241   * Initializes a set of static parsers to be subsequently used
242   * by the various createParserXXX methods.
243   * The parsers are set to their proper states by making them parse
244   * an appropriate HTML input fragment. This approach is the most likely
245   * to ensure all their internal state is consistent.
246   *
247   * <p>In the very unexpected case of the parsing failing (developer error),
248   * this class will fail to initialize properly.
249   *
250   * <p>In addition:
251   * <ul>
252   * <li>The HTML tag is set to a fictitious name {@code xparsertag}.
253   * <li>The attribute name is chosen to match the required attribute type.
254   *     When several possibilities exist, one is chosen arbitrarily.
255   * <li>If quoting is required, a double quote is provided after the '='.
256   * </ul>
257   *
258   * @throws ParseException if parsing failed.
259   */
260  private static void initializeParsers() throws ParseException {
261    parserInDefaultAttr.parse("<xparsertag htmlparser=");
262    parserInDefaultAttrQ.parse("<xparsertag htmlparser=\"");
263
264    // Chosing the "src" attribute, one of several possible names here
265    parserInUriAttrComplete.parse("<xparsertag src=");
266    parserInUriAttrQComplete.parse("<xparsertag src=\"");
267
268    // To support a parser that is initialized within a URL parameter
269    // rather than at the beginning of a URL. We use a fake domain
270    // (example.com from RFC 2606 <http://www.rfc-editor.org/rfc/rfc2606.txt>)
271    // and a fake query parameter.
272    final String fakeUrlPrefix = "http://example.com/fakequeryparam=";
273    parserInUriAttrPartial.parse("<xparsertag src=" + fakeUrlPrefix);
274    parserInUriAttrQPartial.parse("<xparsertag src=\"" + fakeUrlPrefix);
275
276    // Using onmouse= which is a fictitious attribute name that the parser
277    // understands as being a valid javascript-enabled attribute. Chosing fake
278    // names may help during debugging.
279    parserInJsAttr.parse("<xparsertag onmouse=");
280    parserInJsAttrQ.parse("<xparsertag onmouse=\"");
281    // Single quote added as the Javascript is itself quoted.
282    parserInQJsAttr.parse("<xparsertag onmouse=\"'");
283
284    // A parser in the Javascript context within a (single) quoted string.
285    parserInJsQ.resetMode(HtmlParser.Mode.JS);
286    parserInJsQ.parse("var fakeparservar='");
287
288    // Chosing the "style" attribute as it is the only option
289    parserInStyleAttr.parse("<xparsertag style=");
290    parserInStyleAttrQ.parse("<xparsertag style=\"");
291  }
292
293  /**
294   * Throws an {@link AssertionError} if the class was not initialized
295   * correctly, otherwise simply returns. This is to protect against the
296   * possibility the needed parsers were not created successfully during
297   * static initialized, which can only happen due to an error during
298   * development of this library.
299   *
300   * @throws AssertionError when this class failed to initialize
301   */
302  private static void requireInitialized() {
303    if (!initSuccess)
304      throw new AssertionError("HtmlParserFactory failed initialization.");
305  }
306}
307