/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.streamhtmlparser;

import com.google.streamhtmlparser.impl.HtmlParserImpl;

import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

2456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/**
2556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * A factory class to obtain instances of an {@link HtmlParser}.
2656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Currently each instance is a new object given these are fairly
2756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * light-weight.
2856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson *
2956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>In the unlikely case that this class fails to initialize properly
3056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * (a developer error), an error is emitted to the error console and the logs
3156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * and the specialized parser creation methods will throw
3256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * an {@link AssertionError} on all invokations.
3356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */
3456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpublic class HtmlParserFactory {
3556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
3656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final Logger logger =
3756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      Logger.getLogger(HtmlParserFactory.class.getName());
3856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
3956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
4056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * To provide additional options when creating an {@code HtmlParser} using
4156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@link HtmlParserFactory#createParserInAttribute(HtmlParser.ATTR_TYPE,
4256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *        boolean, Set)}
4356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
4456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public enum AttributeOptions {
4556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
4656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    /**
4756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * Indicates that the attribute value is Javascript-quoted. Only takes
4856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * effect for Javascript-accepting attributes - as identified by
4956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * {@link HtmlParser.ATTR_TYPE#JS} - and only when the attribute is also
5056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * HTML quoted.
5156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     */
5256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    JS_QUOTED,
5356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
5456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    /**
5556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * Indicates the attribute value is only a part of a URL as opposed to a
5656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * full URL. In particular, the value is not at the start of a URL and
5756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * hence does not necessitate validation of the URL scheme.
5856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * Only valid for URI-accepting attributes - as identified by
5956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * {@link HtmlParser.ATTR_TYPE#URI}.
6056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     */
6156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    URL_PARTIAL,
6256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
6356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
6456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
6556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * To provide additional options when creating an {@code HtmlParser} using
6656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@link HtmlParserFactory#createParserInMode(HtmlParser.Mode, Set)}
6756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
6856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public enum ModeOptions {
6956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
7056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    /**
7156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * Indicates that the parser is inside a quoted {@code String}. Only
7256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     * valid in the {@link HtmlParser.Mode#JS} mode.
7356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson     */
7456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    JS_QUOTED
7556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
7656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
7756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInDefaultAttr = createParser();
7856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInDefaultAttrQ = createParser();
7956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInUriAttrComplete = createParser();
8056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInUriAttrQComplete = createParser();
8156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInUriAttrPartial = createParser();
8256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInUriAttrQPartial = createParser();
8356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInJsAttr = createParser();
8456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInJsAttrQ = createParser();
8556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInQJsAttr = createParser();
8656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInStyleAttr = createParser();
8756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInStyleAttrQ = createParser();
8856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static final HtmlParser parserInJsQ = createParser();
8956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
9056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
9156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Protects all the createParserXXX methods by throwing a run-time exception
9256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * if this class failed to initialize properly.
9356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
9456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static boolean initSuccess = false;
9556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
9656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  static {
9756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    try {
9856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      initializeParsers();
9956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      initSuccess = true;
10056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    } catch (ParseException e) {
10156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      // Log a severe error and print it to stderr along with a stack trace.
10256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      String error = HtmlParserFactory.class.getName() +
10356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                     " Failed initialization: " + e.getMessage();
10456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      logger.severe(error);
10556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      System.err.println(error);
10656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      e.printStackTrace();
10756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
10856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
10956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
11056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  // Static class.
11156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private HtmlParserFactory() {
11256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }  // COV_NF_LINE
11356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
11456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
11556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns an {@code HtmlParser} object ready to parse HTML input.
11656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
11756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return an {@code HtmlParser} in the provided mode
11856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
11956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static HtmlParser createParser() {
12056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return new HtmlParserImpl();
12156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
12256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
12356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
12456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns an {@code HtmlParser} object initialized with the
12556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * requested Mode. Provide non {@code null} options to provide
12656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * a more precise initialization with the desired Mode.
12756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
12856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param mode the mode to reset the parser with
12956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param options additional options or {@code null} for none
13056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return an {@code HtmlParser} in the provided mode
13156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @throws AssertionError when this class failed to initialize
13256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
13356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static HtmlParser createParserInMode(HtmlParser.Mode mode,
13456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson                                              Set<ModeOptions> options) {
13556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    requireInitialized();
13656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
13756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (options != null && options.contains(ModeOptions.JS_QUOTED))
13856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      return createParser(parserInJsQ);
13956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
14056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // With no options given, this method is just a convenience wrapper for
14156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // the two calls below.
14256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    HtmlParser parser = new HtmlParserImpl();
14356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parser.resetMode(mode);
14456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return parser;
14556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
14656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
14756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
14856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Returns an {@code HtmlParser} that is a copy of the one
14956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * supplied. It holds the same internal state and hence can
15056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * proceed with parsing in-lieu of the supplied parser.
15156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
15256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param aHtmlParser a {@code HtmlParser} to copy from
15356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return an {@code HtmlParser} that is a copy of the provided one
15456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @throws AssertionError when this class failed to initialize
15556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
15656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static HtmlParser createParser(HtmlParser aHtmlParser) {
15756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    requireInitialized();
15856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
15956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Should never get a ClassCastException since there is only one
16056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // implementation of the HtmlParser interface.
16156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return new HtmlParserImpl((HtmlParserImpl) aHtmlParser);
16256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
16356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
16456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
16556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * A very specialized {@code HtmlParser} accessor that returns a parser
16656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * in a state where it expects to read the value of an attribute
16756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * of an HTML tag. This is only useful when the parser has not seen a
16856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * certain HTML tag and an attribute name and needs to continue parsing
16956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * from a state as though it has.
17056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
17156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>For example, to create a parser in a state akin to that
17256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * after the parser has parsed "&lt;a href=\"", invoke:
17356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <pre>
17456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *   createParserInAttribute(HtmlParser.ATTR_TYPE.URI, true)}
17556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </pre>
17656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
17756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>You must provide the proper value of quoting or the parser
17856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * will go into an unexpected state.
17956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * As a special-case, when called with the {@code HtmlParser.ATTR_TYPE}
18056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * of {@code HtmlParser.ATTR_TYPE.NONE}, the parser is created in a state
18156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * inside an HTML tag where it expects an attribute name not an attribute
18256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * value. It becomes equivalent to a parser initialized in the
18356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * {@code HTML_IN_TAG} mode.
18456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
18556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param attrtype the attribute type which the parser should be in
18656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param quoted whether the attribute value is enclosed in double quotes
18756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @param options additional options or {@code null} for none
18856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @return an {@code HtmlParser} initialized in the given attribute type
18956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *         and quoting
19056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @throws AssertionError when this class failed to initialize
19156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
19256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  public static HtmlParser createParserInAttribute(
19356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      HtmlParser.ATTR_TYPE attrtype,
19456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      boolean quoted, Set<AttributeOptions> options) {
19556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    requireInitialized();
19656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
19756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    HtmlParser parser;
19856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    switch (attrtype) {
19956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      case REGULAR:
20056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        parser = createParser(
20156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            quoted ? parserInDefaultAttrQ : parserInDefaultAttr);
20256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        break;
20356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      case URI:
20456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        if (options != null && options.contains(AttributeOptions.URL_PARTIAL))
20556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          parser = createParser(
20656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson              quoted ? parserInUriAttrQPartial : parserInUriAttrPartial);
20756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        else
20856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          parser = createParser(
20956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson              quoted ? parserInUriAttrQComplete : parserInUriAttrComplete);
21056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        break;
21156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      case JS:
21256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        // Note: We currently do not support the case of the value being
21356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        // inside a Javascript quoted string that is in an unquoted HTML
21456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        // attribute, such as <a href=bla onmouseover=alert('[VALUE')>.
21556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        // It would be simple to add but currently we assume Javascript
21656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        // quoted attribute values are always HTML quoted.
21756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        if (quoted) {
21856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          if (options != null && options.contains(AttributeOptions.JS_QUOTED))
21956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            parser = createParser(parserInQJsAttr);
22056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          else
22156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            parser = createParser(parserInJsAttrQ);
22256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        } else {
22356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson          parser = createParser(parserInJsAttr);
22456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        }
22556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        break;
22656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      case STYLE:
22756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        parser = createParser(
22856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            quoted ? parserInStyleAttrQ : parserInStyleAttr);
22956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        break;
23056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      case NONE:
23156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        parser = createParserInMode(HtmlParser.Mode.HTML_IN_TAG, null);
23256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        break;
23356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      default:
23456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson        throw new IllegalArgumentException(
23556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson            "Did not recognize ATTR_TYPE given: " + attrtype);
23656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    }
23756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    return parser;
23856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
23956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
24056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
24156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Initializes a set of static parsers to be subsequently used
24256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * by the various createParserXXX methods.
24356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * The parsers are set to their proper states by making them parse
24456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * an appropriate HTML input fragment. This approach is the most likely
24556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * to ensure all their internal state is consistent.
24656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
24756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>In the very unexpected case of the parsing failing (developer error),
24856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * this class will fail to initialize properly.
24956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
25056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <p>In addition:
25156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <ul>
25256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>The HTML tag is set to a fictitious name {@code xparsertag}.
25356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>The attribute name is chosen to match the required attribute type.
25456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *     When several possibilities exist, one is chosen arbitrarily.
25556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * <li>If quoting is required, a double quote is provided after the '='.
25656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * </ul>
25756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
25856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @throws ParseException if parsing failed.
25956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
26056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static void initializeParsers() throws ParseException {
26156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInDefaultAttr.parse("<xparsertag htmlparser=");
26256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInDefaultAttrQ.parse("<xparsertag htmlparser=\"");
26356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
26456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Chosing the "src" attribute, one of several possible names here
26556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInUriAttrComplete.parse("<xparsertag src=");
26656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInUriAttrQComplete.parse("<xparsertag src=\"");
26756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
26856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // To support a parser that is initialized within a URL parameter
26956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // rather than at the beginning of a URL. We use a fake domain
27056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // (example.com from RFC 2606 <http://www.rfc-editor.org/rfc/rfc2606.txt>)
27156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // and a fake query parameter.
27256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    final String fakeUrlPrefix = "http://example.com/fakequeryparam=";
27356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInUriAttrPartial.parse("<xparsertag src=" + fakeUrlPrefix);
27456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInUriAttrQPartial.parse("<xparsertag src=\"" + fakeUrlPrefix);
27556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
27656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Using onmouse= which is a fictitious attribute name that the parser
27756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // understands as being a valid javascript-enabled attribute. Chosing fake
27856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // names may help during debugging.
27956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInJsAttr.parse("<xparsertag onmouse=");
28056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInJsAttrQ.parse("<xparsertag onmouse=\"");
28156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Single quote added as the Javascript is itself quoted.
28256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInQJsAttr.parse("<xparsertag onmouse=\"'");
28356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
28456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // A parser in the Javascript context within a (single) quoted string.
28556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInJsQ.resetMode(HtmlParser.Mode.JS);
28656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInJsQ.parse("var fakeparservar='");
28756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
28856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    // Chosing the "style" attribute as it is the only option
28956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInStyleAttr.parse("<xparsertag style=");
29056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    parserInStyleAttrQ.parse("<xparsertag style=\"");
29156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
29256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson
29356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  /**
29456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * Throws an {@link AssertionError} if the class was not initialized
29556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * correctly, otherwise simply returns. This is to protect against the
29656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * possibility the needed parsers were not created successfully during
29756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * static initialized, which can only happen due to an error during
29856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * development of this library.
29956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   *
30056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   * @throws AssertionError when this class failed to initialize
30156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson   */
30256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  private static void requireInitialized() {
30356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson    if (!initSuccess)
30456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson      throw new AssertionError("HtmlParserFactory failed initialization.");
30556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson  }
30656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson}
307