ParserStateTable.java revision 56ed4167b942ec265f9cee70ac4d71d10b3835ce
1/*
2 * Copyright (C) 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.streamhtmlparser.impl;
18
19import com.google.common.base.Preconditions;
20
21/**
22 * Holds a state table which is defined as the set of all
23 * recognized state transitions and the set of characters that
24 * trigger them.
25 *
26 * <p>The logic of what character causes what state transition is derived from
27 * a base definition written as a Python configuration file in the original
28 * C++ parser.
29 *
30 * <p>This class provides methods to initially build the state table and then
31 * methods at parsing time to determine the transitions to subsequent states.
32 *
33 * <p>Note on characters outside the extended ASCII range: Currently, all state
34 * transitions in the Python configuration file trigger only on extended
35 * ASCII characters, that is characters in the Unicode space of [U+0000 to
36 * U+00FF]. We use that property to design a more efficient state transition
37 * representation. When receiving characters outside that ASCII range, we
38 * simply apply the DEFAULT transition for the given state - as we do for any
39 * character that is not a hot character for that state. If no default
40 * transition exists, we switch to the Internal Error state.
41 *
42 * <p>Technical note: In Java, a {@code char} is a code unit and in some cases
43 * may not represent a complete Unicode code point. However, when that happens,
44 * the code units that follow for that code point are all in the surrogate area
45 * [U+D800 - U+DFFF] and hence outside of the ASCII range and will not trigger
46 * any incorrect state transitions.
47 *
48 * <p>This class is storage-inefficient but it is static at least
49 * and not generated for each Parser instance.
50 */
51class ParserStateTable {
52
53  /**
54   * A limit on how many different states we can have in one state table.
55   * Can be increased should it no longer be sufficient.
56   */
57  private static final int MAX_STATES = 256;
58
59  /**
60   * We only check transitions for (extended) ASCII characters, hence
61   * characters in the range 0 to MAX_CHARS -1.
62   */
63  private static final int MAX_CHARS = 256;
64
65  /**
66   * Records all state transitions except those identified as DEFAULT
67   * transitions. It is two dimensional: Stores a target {@code InternalState}
68   * given a source state (referenced by its numeric ID) and the current
69   * character.
70   */
71  private final InternalState[][] stateTable;
72
73  /**
74   * Records all DEFAULT state transitions. These are transitions provided
75   * using the {@code "[:default:]"} syntax in the Python configuration file.
76   * There can be only one such transition for any given source state, hence
77   * the array is one dimensional.
78   */
79  private final InternalState[] defaultStateTable;
80
81  public ParserStateTable() {
82    stateTable = new InternalState[MAX_STATES][MAX_CHARS];
83    defaultStateTable = new InternalState[MAX_STATES];
84  }
85
86  /**
87   * Returns the state to go to when receiving the current {@code char}
88   * in the {@code from} state.
89   * Returns {@code InternalState.INTERNAL_ERROR_STATE} if there is no
90   * state transition for that character and no default state transition
91   * for that state.
92   *
93   * <p>For ASCII characters, first look-up an explicit state transition for
94   * the current character. If none is found, look-up a default transition. For
95   * non-ASCII characters, look-up a default transition only.
96   *
97   * @param from the source state
98   * @param currentChar the character received
99   * @return the state to move to or {@code InternalState.INTERNAL_ERROR_STATE}
100   */
101  InternalState getNextState(InternalState from, int currentChar) {
102    // TODO: Consider throwing run-time error here.
103    if (from == null || currentChar < 0)
104      return InternalState.INTERNAL_ERROR_STATE;
105
106    int id = from.getId();
107    if (id < 0 || id >= MAX_STATES) {
108      return InternalState.INTERNAL_ERROR_STATE;
109    }
110
111    InternalState result = null;
112    if (currentChar < MAX_CHARS) {
113      result = stateTable[id][currentChar];
114    }
115    if (result == null) {
116        result = defaultStateTable[from.getId()];
117    }
118    return result != null ? result : InternalState.INTERNAL_ERROR_STATE;
119  }
120
121  void setExpression(String expr, InternalState from, InternalState to) {
122    if ((expr == null) || (from == null) || (to == null)) {
123      return;
124    }
125
126    // This special string indicates a default state transition.
127    if ("[:default:]".equals(expr)) {
128      setDefaultDestination(from, to);
129      return;
130    }
131    int i = 0;
132    while (i < expr.length()) {
133      // If next char is a '-' which is not the last character of the expr
134      if (i < (expr.length() - 2) && expr.charAt(i + 1) == '-') {
135        setRange(from, expr.charAt(i), expr.charAt(i + 2), to);
136        i += 2;
137      } else {
138        setDestination(from, expr.charAt(i), to);
139        i++;
140      }
141    }
142  }
143
144  private void fill(InternalState from, InternalState to) {
145    char c;
146    for (c = 0; c < MAX_CHARS; c++) {
147      setDestination(from, c, to);
148    }
149  }
150
151  private void setDefaultDestination(InternalState from, InternalState to) {
152    Preconditions.checkNotNull(from);   // Developer error if it triggers
153    Preconditions.checkNotNull(to);     // Developer error if it triggers
154    int id = from.getId();
155    if ((id < 0) || (id >= MAX_STATES)) {
156      return;
157    }
158    // TODO: Consider asserting if there was a state transition defined.
159    defaultStateTable[from.getId()] = to;
160  }
161
162  private void setDestination(InternalState from, char chr, InternalState to) {
163    Preconditions.checkNotNull(from);   // Developer error if it triggers
164    Preconditions.checkNotNull(to);     // Developer error if it triggers
165    Preconditions.checkArgument(chr >= 0 && chr < MAX_CHARS,
166                                "char must be in ASCII set: %c", chr);
167    int id = from.getId();
168    if ((id < 0) || (id >= MAX_STATES)) {
169      return;
170    }
171    stateTable[from.getId()][chr] = to;
172  }
173
174  private void setRange(InternalState from, char start, char end,
175                        InternalState to) {
176    // Developer error if either trigger.
177    Preconditions.checkArgument(start >= 0 && start < MAX_CHARS,
178                                "char must be in ASCII set: %c", start);
179    Preconditions.checkArgument(end >= 0 && end < MAX_CHARS,
180                                "char must be in ASCII set: %c", end);
181    char c;
182    for (c = start; c <= end; c++) {
183      setDestination(from, c, to);
184    }
185  }
186}
187