1/* 2 * Copyright (C) 2010 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.google.streamhtmlparser.impl; 18 19import com.google.common.base.Preconditions; 20 21/** 22 * Holds a state table which is defined as the set of all 23 * recognized state transitions and the set of characters that 24 * trigger them. 25 * 26 * <p>The logic of what character causes what state transition is derived from 27 * a base definition written as a Python configuration file in the original 28 * C++ parser. 29 * 30 * <p>This class provides methods to initially build the state table and then 31 * methods at parsing time to determine the transitions to subsequent states. 32 * 33 * <p>Note on characters outside the extended ASCII range: Currently, all state 34 * transitions in the Python configuration file trigger only on extended 35 * ASCII characters, that is characters in the Unicode space of [U+0000 to 36 * U+00FF]. We use that property to design a more efficient state transition 37 * representation. When receiving characters outside that ASCII range, we 38 * simply apply the DEFAULT transition for the given state - as we do for any 39 * character that is not a hot character for that state. If no default 40 * transition exists, we switch to the Internal Error state. 41 * 42 * <p>Technical note: In Java, a {@code char} is a code unit and in some cases 43 * may not represent a complete Unicode code point. However, when that happens, 44 * the code units that follow for that code point are all in the surrogate area 45 * [U+D800 - U+DFFF] and hence outside of the ASCII range and will not trigger 46 * any incorrect state transitions. 47 * 48 * <p>This class is storage-inefficient but it is static at least 49 * and not generated for each Parser instance. 50 */ 51class ParserStateTable { 52 53 /** 54 * A limit on how many different states we can have in one state table. 55 * Can be increased should it no longer be sufficient. 56 */ 57 private static final int MAX_STATES = 256; 58 59 /** 60 * We only check transitions for (extended) ASCII characters, hence 61 * characters in the range 0 to MAX_CHARS -1. 62 */ 63 private static final int MAX_CHARS = 256; 64 65 /** 66 * Records all state transitions except those identified as DEFAULT 67 * transitions. It is two dimensional: Stores a target {@code InternalState} 68 * given a source state (referenced by its numeric ID) and the current 69 * character. 70 */ 71 private final InternalState[][] stateTable; 72 73 /** 74 * Records all DEFAULT state transitions. These are transitions provided 75 * using the {@code "[:default:]"} syntax in the Python configuration file. 76 * There can be only one such transition for any given source state, hence 77 * the array is one dimensional. 78 */ 79 private final InternalState[] defaultStateTable; 80 81 public ParserStateTable() { 82 stateTable = new InternalState[MAX_STATES][MAX_CHARS]; 83 defaultStateTable = new InternalState[MAX_STATES]; 84 } 85 86 /** 87 * Returns the state to go to when receiving the current {@code char} 88 * in the {@code from} state. 89 * Returns {@code InternalState.INTERNAL_ERROR_STATE} if there is no 90 * state transition for that character and no default state transition 91 * for that state. 92 * 93 * <p>For ASCII characters, first look-up an explicit state transition for 94 * the current character. If none is found, look-up a default transition. For 95 * non-ASCII characters, look-up a default transition only. 96 * 97 * @param from the source state 98 * @param currentChar the character received 99 * @return the state to move to or {@code InternalState.INTERNAL_ERROR_STATE} 100 */ 101 InternalState getNextState(InternalState from, int currentChar) { 102 // TODO: Consider throwing run-time error here. 103 if (from == null || currentChar < 0) 104 return InternalState.INTERNAL_ERROR_STATE; 105 106 int id = from.getId(); 107 if (id < 0 || id >= MAX_STATES) { 108 return InternalState.INTERNAL_ERROR_STATE; 109 } 110 111 InternalState result = null; 112 if (currentChar < MAX_CHARS) { 113 result = stateTable[id][currentChar]; 114 } 115 if (result == null) { 116 result = defaultStateTable[from.getId()]; 117 } 118 return result != null ? result : InternalState.INTERNAL_ERROR_STATE; 119 } 120 121 void setExpression(String expr, InternalState from, InternalState to) { 122 if ((expr == null) || (from == null) || (to == null)) { 123 return; 124 } 125 126 // This special string indicates a default state transition. 127 if ("[:default:]".equals(expr)) { 128 setDefaultDestination(from, to); 129 return; 130 } 131 int i = 0; 132 while (i < expr.length()) { 133 // If next char is a '-' which is not the last character of the expr 134 if (i < (expr.length() - 2) && expr.charAt(i + 1) == '-') { 135 setRange(from, expr.charAt(i), expr.charAt(i + 2), to); 136 i += 2; 137 } else { 138 setDestination(from, expr.charAt(i), to); 139 i++; 140 } 141 } 142 } 143 144 private void fill(InternalState from, InternalState to) { 145 char c; 146 for (c = 0; c < MAX_CHARS; c++) { 147 setDestination(from, c, to); 148 } 149 } 150 151 private void setDefaultDestination(InternalState from, InternalState to) { 152 Preconditions.checkNotNull(from); // Developer error if it triggers 153 Preconditions.checkNotNull(to); // Developer error if it triggers 154 int id = from.getId(); 155 if ((id < 0) || (id >= MAX_STATES)) { 156 return; 157 } 158 // TODO: Consider asserting if there was a state transition defined. 159 defaultStateTable[from.getId()] = to; 160 } 161 162 private void setDestination(InternalState from, char chr, InternalState to) { 163 Preconditions.checkNotNull(from); // Developer error if it triggers 164 Preconditions.checkNotNull(to); // Developer error if it triggers 165 Preconditions.checkArgument(chr >= 0 && chr < MAX_CHARS, 166 "char must be in ASCII set: %c", chr); 167 int id = from.getId(); 168 if ((id < 0) || (id >= MAX_STATES)) { 169 return; 170 } 171 stateTable[from.getId()][chr] = to; 172 } 173 174 private void setRange(InternalState from, char start, char end, 175 InternalState to) { 176 // Developer error if either trigger. 177 Preconditions.checkArgument(start >= 0 && start < MAX_CHARS, 178 "char must be in ASCII set: %c", start); 179 Preconditions.checkArgument(end >= 0 && end < MAX_CHARS, 180 "char must be in ASCII set: %c", end); 181 char c; 182 for (c = start; c <= end; c++) { 183 setDestination(from, c, to); 184 } 185 } 186} 187