GenericParser.java revision 56ed4167b942ec265f9cee70ac4d71d10b3835ce
1/*
2 * Copyright (C) 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.streamhtmlparser.impl;
18
19import com.google.common.base.Preconditions;
20import com.google.streamhtmlparser.ExternalState;
21import com.google.streamhtmlparser.Parser;
22import com.google.streamhtmlparser.ParseException;
23import com.google.streamhtmlparser.util.HtmlUtils;
24
25import java.util.Map;
26
27/**
28 * An implementation of the {@code Parser} interface that is common to both
29 * {@code HtmlParser} and {@code JavascriptParser}.
30 *
31 * <p>Provides methods for parsing input and ensuring that all in-state,
32 * entering-a-state and exiting-a-state callbacks are invoked as appropriate.
33 *
34 * <p>This class started as abstract but it was found better for testing to
35 * make it instantiatable so that the parsing logic can be tested with dummy
36 * state transitions.
37 */
38public class GenericParser implements Parser {
39
40  protected final ParserStateTable parserStateTable;
41  protected final Map<InternalState, ExternalState> intToExtStateTable;
42  protected final InternalState initialState;
43  protected InternalState currentState;
44  protected int lineNumber;
45  protected int columnNumber;
46
47  protected GenericParser(ParserStateTable parserStateTable,
48                          Map<InternalState, ExternalState> intToExtStateTable,
49                          InternalState initialState) {
50    this.parserStateTable = parserStateTable;
51    this.intToExtStateTable = intToExtStateTable;
52    this.initialState = initialState;
53    this.currentState = initialState;
54    this.lineNumber = 1;
55    this.columnNumber = 1;
56  }
57
58  /**
59   * Constructs a generic parser that is an exact copy of the
60   * one given. Note that here too, data structures that do not
61   * change are shallow-copied (parser state table and state mappings).
62   *
63   * @param aGenericParser the {@code GenericParser} to copy
64   */
65  protected GenericParser(GenericParser aGenericParser) {
66    parserStateTable = aGenericParser.parserStateTable;
67    intToExtStateTable = aGenericParser.intToExtStateTable;
68    initialState = aGenericParser.initialState;
69    currentState = aGenericParser.currentState;
70    lineNumber = aGenericParser.lineNumber;
71    columnNumber = aGenericParser.columnNumber;
72  }
73
74  /**
75   * Tell the parser to process the provided {@code String}. This is just a
76   * convenience method that wraps over {@link Parser#parse(char)}.
77   * @param input the {@code String} to parse
78   * @throws ParseException if an unrecoverable error occurred during parsing
79   */
80  @Override
81  public void parse(String input) throws ParseException {
82    for (int i = 0; i < input.length(); i++)
83      parse(input.charAt(i));
84  }
85
86  /**
87   * Main loop for parsing of input.
88   *
89   * <p>Absent any callbacks defined, this function simply determines the
90   * next state to switch to based on the <code>ParserStateTable</code> which is
91   * derived from a state-machine configuration file in the original C++ parser.
92   *
93   * <p>However some states have specific callbacks defined which when
94   * receiving specific characters may decide to overwrite the next state to
95   * go to. Hence the next state is a function both of the main state table
96   * in {@code ParserStateTable} as well as specific run-time information
97   * from the callback functions.
98   *
99   * <p>Also note that the callbacks are called in a proper sequence,
100   * first the exit-state one then the enter-state one and finally the
101   * in-state one. Changing the order may result in a functional change.
102   *
103   * @param input the input character to parse (process)
104   * @throws ParseException if an unrecoverable error occurred during parsing
105   */
106  @Override
107  public void parse(char input) throws ParseException {
108    InternalState nextState =
109        parserStateTable.getNextState(currentState, input);
110
111    if (nextState == InternalState.INTERNAL_ERROR_STATE) {
112        String errorMsg =
113            String.format("Unexpected character '%s' in int_state '%s' " +
114                          "(ext_state '%s')",
115                          HtmlUtils.encodeCharForAscii(input),
116                          currentState.getName(), getState().getName());
117      currentState = InternalState.INTERNAL_ERROR_STATE;
118      throw new ParseException(this, errorMsg);
119    }
120
121    if (currentState != nextState) {
122      nextState = handleExitState(currentState, nextState, input);
123    }
124    if (currentState != nextState) {
125      nextState = handleEnterState(nextState, nextState, input);
126    }
127    nextState = handleInState(nextState, input);
128    currentState = nextState;
129    record(input);
130
131    columnNumber++;
132    if (input == '\n') {
133      lineNumber++;
134      columnNumber = 1;
135    }
136  }
137
138  /**
139   * Return the current state of the parser.
140   */
141  @Override
142  public ExternalState getState() {
143    if (!intToExtStateTable.containsKey(currentState)) {
144      throw new NullPointerException("Did not find external state mapping " +
145                                     "For internal state: " + currentState);
146    }
147    return intToExtStateTable.get(currentState);
148  }
149
150  /**
151   * Reset the parser back to its initial default state.
152   */
153  @Override
154  public void reset() {
155    currentState = initialState;
156    lineNumber = 1;
157    columnNumber = 1;
158  }
159
160  /**
161   * Sets the current line number which is returned during error messages.
162   */
163  @Override
164  public void setLineNumber(int lineNumber) {
165    this.lineNumber = lineNumber;
166  }
167
168  /**
169   * Returns the current line number.
170   */
171  @Override
172  public int getLineNumber() {
173    return lineNumber;
174  }
175
176  /**
177   * Sets the current column number which is returned during error messages.
178   */
179  @Override
180  public void setColumnNumber(int columnNumber) {
181    this.columnNumber = columnNumber;
182  }
183
184  /**
185   * Returns the current column number.
186   */
187  @Override
188  public int getColumnNumber() {
189    return columnNumber;
190  }
191
192  InternalState getCurrentInternalState() {
193    return currentState;
194  }
195
196  protected void setNextState(InternalState nextState) throws ParseException {
197    Preconditions.checkNotNull(nextState);   // Developer error if it triggers.
198
199    /* We are not actually parsing hence providing
200     * a null char to the event handlers.
201     */
202    // TODO: Complicated logic to follow in C++ but clean it up.
203    final char nullChar = '\0';
204
205    if (currentState != nextState) {
206      nextState = handleExitState(currentState, nextState, nullChar);
207    }
208    if (currentState != nextState) {
209      handleEnterState(nextState, nextState, nullChar);
210    }
211    currentState = nextState;
212  }
213
214  /**
215   * Invoked when the parser enters a new state.
216   *
217   * @param currentState the current state of the parser
218   * @param expectedNextState the next state according to the
219   *        state table definition
220   * @param input the last character parsed
221   * @return the state to change to, could be the same as the
222   *         {@code expectedNextState} provided
223   * @throws ParseException if an unrecoverable error occurred during parsing
224   */
225  protected InternalState handleEnterState(InternalState currentState,
226                                           InternalState expectedNextState,
227                                           char input) throws ParseException {
228    return expectedNextState;
229  }
230
231  /**
232   * Invoked when the parser exits a state.
233   *
234   * @param currentState the current state of the parser
235   * @param expectedNextState the next state according to the
236   *        state table definition
237   * @param input the last character parsed
238   * @return the state to change to, could be the same as the
239   *         {@code expectedNextState} provided
240   * @throws ParseException if an unrecoverable error occurred during parsing
241   */
242  protected InternalState handleExitState(InternalState currentState,
243                                          InternalState expectedNextState,
244                                          char input) throws ParseException {
245    return expectedNextState;
246  }
247
248  /**
249   * Invoked for each character read when no state change occured.
250   *
251   * @param currentState the current state of the parser
252   * @param input the last character parsed
253   * @return the state to change to, could be the same as the
254   *         {@code expectedNextState} provided
255   * @throws ParseException if an unrecoverable error occurred during parsing
256   */
257  protected InternalState handleInState(InternalState currentState,
258                                        char input) throws ParseException {
259    return currentState;
260  }
261
262  /**
263   * Perform some processing on the given character. Derived classes
264   * may override this method in order to perform additional logic
265   * on every processed character beyond the logic defined in
266   * state transitions.
267   *
268   * @param input the input character to operate on
269   */
270  protected void record(char input) { }
271}
272