1 /*
2  * Copyright (C) 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.streamhtmlparser.impl;
18 
19 import com.google.common.base.Preconditions;
20 
21 /**
22  * Holds a state table which is defined as the set of all
23  * recognized state transitions and the set of characters that
24  * trigger them.
25  *
26  * <p>The logic of what character causes what state transition is derived from
27  * a base definition written as a Python configuration file in the original
28  * C++ parser.
29  *
30  * <p>This class provides methods to initially build the state table and then
31  * methods at parsing time to determine the transitions to subsequent states.
32  *
33  * <p>Note on characters outside the extended ASCII range: Currently, all state
34  * transitions in the Python configuration file trigger only on extended
35  * ASCII characters, that is characters in the Unicode space of [U+0000 to
36  * U+00FF]. We use that property to design a more efficient state transition
37  * representation. When receiving characters outside that ASCII range, we
38  * simply apply the DEFAULT transition for the given state - as we do for any
39  * character that is not a hot character for that state. If no default
40  * transition exists, we switch to the Internal Error state.
41  *
42  * <p>Technical note: In Java, a {@code char} is a code unit and in some cases
43  * may not represent a complete Unicode code point. However, when that happens,
44  * the code units that follow for that code point are all in the surrogate area
45  * [U+D800 - U+DFFF] and hence outside of the ASCII range and will not trigger
46  * any incorrect state transitions.
47  *
48  * <p>This class is storage-inefficient but it is static at least
49  * and not generated for each Parser instance.
50  */
51 class ParserStateTable {
52 
53   /**
54    * A limit on how many different states we can have in one state table.
55    * Can be increased should it no longer be sufficient.
56    */
57   private static final int MAX_STATES = 256;
58 
59   /**
60    * We only check transitions for (extended) ASCII characters, hence
61    * characters in the range 0 to MAX_CHARS -1.
62    */
63   private static final int MAX_CHARS = 256;
64 
65   /**
66    * Records all state transitions except those identified as DEFAULT
67    * transitions. It is two dimensional: Stores a target {@code InternalState}
68    * given a source state (referenced by its numeric ID) and the current
69    * character.
70    */
71   private final InternalState[][] stateTable;
72 
73   /**
74    * Records all DEFAULT state transitions. These are transitions provided
75    * using the {@code "[:default:]"} syntax in the Python configuration file.
76    * There can be only one such transition for any given source state, hence
77    * the array is one dimensional.
78    */
79   private final InternalState[] defaultStateTable;
80 
ParserStateTable()81   public ParserStateTable() {
82     stateTable = new InternalState[MAX_STATES][MAX_CHARS];
83     defaultStateTable = new InternalState[MAX_STATES];
84   }
85 
86   /**
87    * Returns the state to go to when receiving the current {@code char}
88    * in the {@code from} state.
89    * Returns {@code InternalState.INTERNAL_ERROR_STATE} if there is no
90    * state transition for that character and no default state transition
91    * for that state.
92    *
93    * <p>For ASCII characters, first look-up an explicit state transition for
94    * the current character. If none is found, look-up a default transition. For
95    * non-ASCII characters, look-up a default transition only.
96    *
97    * @param from the source state
98    * @param currentChar the character received
99    * @return the state to move to or {@code InternalState.INTERNAL_ERROR_STATE}
100    */
getNextState(InternalState from, int currentChar)101   InternalState getNextState(InternalState from, int currentChar) {
102     // TODO: Consider throwing run-time error here.
103     if (from == null || currentChar < 0)
104       return InternalState.INTERNAL_ERROR_STATE;
105 
106     int id = from.getId();
107     if (id < 0 || id >= MAX_STATES) {
108       return InternalState.INTERNAL_ERROR_STATE;
109     }
110 
111     InternalState result = null;
112     if (currentChar < MAX_CHARS) {
113       result = stateTable[id][currentChar];
114     }
115     if (result == null) {
116         result = defaultStateTable[from.getId()];
117     }
118     return result != null ? result : InternalState.INTERNAL_ERROR_STATE;
119   }
120 
setExpression(String expr, InternalState from, InternalState to)121   void setExpression(String expr, InternalState from, InternalState to) {
122     if ((expr == null) || (from == null) || (to == null)) {
123       return;
124     }
125 
126     // This special string indicates a default state transition.
127     if ("[:default:]".equals(expr)) {
128       setDefaultDestination(from, to);
129       return;
130     }
131     int i = 0;
132     while (i < expr.length()) {
133       // If next char is a '-' which is not the last character of the expr
134       if (i < (expr.length() - 2) && expr.charAt(i + 1) == '-') {
135         setRange(from, expr.charAt(i), expr.charAt(i + 2), to);
136         i += 2;
137       } else {
138         setDestination(from, expr.charAt(i), to);
139         i++;
140       }
141     }
142   }
143 
fill(InternalState from, InternalState to)144   private void fill(InternalState from, InternalState to) {
145     char c;
146     for (c = 0; c < MAX_CHARS; c++) {
147       setDestination(from, c, to);
148     }
149   }
150 
setDefaultDestination(InternalState from, InternalState to)151   private void setDefaultDestination(InternalState from, InternalState to) {
152     Preconditions.checkNotNull(from);   // Developer error if it triggers
153     Preconditions.checkNotNull(to);     // Developer error if it triggers
154     int id = from.getId();
155     if ((id < 0) || (id >= MAX_STATES)) {
156       return;
157     }
158     // TODO: Consider asserting if there was a state transition defined.
159     defaultStateTable[from.getId()] = to;
160   }
161 
setDestination(InternalState from, char chr, InternalState to)162   private void setDestination(InternalState from, char chr, InternalState to) {
163     Preconditions.checkNotNull(from);   // Developer error if it triggers
164     Preconditions.checkNotNull(to);     // Developer error if it triggers
165     Preconditions.checkArgument(chr >= 0 && chr < MAX_CHARS,
166                                 "char must be in ASCII set: %c", chr);
167     int id = from.getId();
168     if ((id < 0) || (id >= MAX_STATES)) {
169       return;
170     }
171     stateTable[from.getId()][chr] = to;
172   }
173 
setRange(InternalState from, char start, char end, InternalState to)174   private void setRange(InternalState from, char start, char end,
175                         InternalState to) {
176     // Developer error if either trigger.
177     Preconditions.checkArgument(start >= 0 && start < MAX_CHARS,
178                                 "char must be in ASCII set: %c", start);
179     Preconditions.checkArgument(end >= 0 && end < MAX_CHARS,
180                                 "char must be in ASCII set: %c", end);
181     char c;
182     for (c = start; c <= end; c++) {
183       setDestination(from, c, to);
184     }
185   }
186 }
187