1// Copyright 2005-2008 Google Inc. All Rights Reserved.
2// Author: jrm@google.com (Jim Meehan)
3
4#include <google/protobuf/stubs/common.h>
5
6namespace google {
7namespace protobuf {
8namespace internal {
9
10// These four-byte entries compactly encode how many bytes 0..255 to delete
11// in making a string replacement, how many bytes to add 0..255, and the offset
12// 0..64k-1 of the replacement string in remap_string.
13struct RemapEntry {
14  uint8 delete_bytes;
15  uint8 add_bytes;
16  uint16 bytes_offset;
17};
18
// Exit codes that share the state tables with next-state numbers.
// Every code except kExitDstSpaceFull is stored directly in a one-byte table
// entry; kExitDstSpaceFull is only ever produced by executable code.
// So that one comparison can tell an exit code from a next-state number,
// the codes must stay contiguous and none may exceed kExitNone.
typedef enum {
  kExitDstSpaceFull     = 239,
  kExitIllegalStructure = 240,
  kExitOK               = 241,
  kExitReject           = 242,
  kExitReplace1         = 243,
  kExitReplace2         = 244,
  kExitReplace3         = 245,
  kExitReplace21        = 246,
  kExitReplace31        = 247,
  kExitReplace32        = 248,
  kExitReplaceOffset1   = 249,
  kExitReplaceOffset2   = 250,
  kExitReplace1S0       = 251,
  kExitSpecial          = 252,
  kExitDoAgain          = 253,
  kExitRejectAlt        = 254,
  kExitNone             = 255
} ExitReason;
42
43
44// This struct represents one entire state table. The three initialized byte
45// areas are state_table, remap_base, and remap_string. state0 and state0_size
46// give the byte offset and length within state_table of the initial state --
47// table lookups are expected to start and end in this state, but for
48// truncated UTF-8 strings, may end in a different state. These allow a quick
49// test for that condition. entry_shift is 8 for tables subscripted by a full
50// byte value and 6 for space-optimized tables subscripted by only six
51// significant bits in UTF-8 continuation bytes.
52typedef struct {
53  const uint32 state0;
54  const uint32 state0_size;
55  const uint32 total_size;
56  const int max_expand;
57  const int entry_shift;
58  const int bytes_per_entry;
59  const uint32 losub;
60  const uint32 hiadd;
61  const uint8* state_table;
62  const RemapEntry* remap_base;
63  const uint8* remap_string;
64  const uint8* fast_state;
65} UTF8StateMachineObj;
66
67typedef UTF8StateMachineObj UTF8ScanObj;
68
// Three-character shorthands for the exit codes so that the state-table
// initializers line up in columns.  They are #undef'd again once the tables
// have been defined.
#define X__  (kExitIllegalStructure)
#define RJ_  (kExitReject)
#define S1_  (kExitReplace1)
#define S2_  (kExitReplace2)
#define S3_  (kExitReplace3)
#define S21  (kExitReplace21)
#define S31  (kExitReplace31)
#define S32  (kExitReplace32)
#define T1_  (kExitReplaceOffset1)
#define T2_  (kExitReplaceOffset2)
#define S11  (kExitReplace1S0)
#define SP_  (kExitSpecial)
#define D__  (kExitDoAgain)
#define RJA  (kExitRejectAlt)
83
// Parameters of the utf8acceptnonsurrogates table, which consists of
// 9 state blocks of 256 entries each.
static const unsigned int utf8acceptnonsurrogates_STATE0 = 0;  // state[0]
static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256;  // one block
static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 9 * 256;
static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;  // full-byte subscript
static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
93
94static const uint8 utf8acceptnonsurrogates[] = {
95// state[0] 0x000000 Byte 1
96  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
97  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
98  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
99  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
100
101  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
102  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
103  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
104  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
105
106X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
107X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
108X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
109X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
110
111X__, X__,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
112  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
113  2,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   7,   3,   3,
114  4,   5,   5,   5,   6, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
115
116// state[1] 0x000080 Byte 2 of 2
117X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
118X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
119X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
120X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
121
122X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
123X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
124X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
125X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
126
127  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
128  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
129  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
130  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
131
132X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
133X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
134X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
135X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
136
137// state[2] 0x000000 Byte 2 of 3
138X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
139X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
140X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
141X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
142
143X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
144X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
145X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
146X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
147
148X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
149X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
150  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
151  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
152
153X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
154X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
155X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
156X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
157
158// state[3] 0x001000 Byte 2 of 3
159X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
160X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
161X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
162X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
163
164X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
165X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
166X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
167X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
168
169  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
170  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
171  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
172  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
173
174X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
175X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
176X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
177X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
178
179// state[4] 0x000000 Byte 2 of 4
180X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
181X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
182X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
183X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
184
185X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
186X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
187X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
188X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
189
190X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
191  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
192  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
193  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
194
195X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
196X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
197X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
198X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
199
200// state[5] 0x040000 Byte 2 of 4
201X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
202X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
203X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
204X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
205
206X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
207X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
208X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
209X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
210
211  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
212  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
213  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
214  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
215
216X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
217X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
218X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
219X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
220
221// state[6] 0x100000 Byte 2 of 4
222X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
223X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
224X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
225X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
226
227X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
228X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
229X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
230X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
231
232  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
233X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
234X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
235X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
236
237X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
238X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
239X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
240X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
241
242// state[7] 0x00d000 Byte 2 of 3
243X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
244X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
245X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
246X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
247
248X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
249X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
250X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
251X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
252
253  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
254  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
255  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
256  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
257
258X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
259X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
260X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
261X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
262
263// state[8] 0x00d800 Byte 3 of 3
264X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
265X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
266X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
267X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
268
269X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
270X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
271X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
272X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
273
274RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
275RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
276RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
277RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
278
279X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
280X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
281X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
282X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
283};
284
285// Remap base[0] = (del, add, string_offset)
286static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
287{0, 0, 0} };
288
289// Remap string[0]
290static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
2910 };
292
// Row helpers local to this table: N copies of v, each followed by a comma.
#define UTF8_FAST_ROW16_(v) v, v, v, v, v, v, v, v,  v, v, v, v, v, v, v, v,
#define UTF8_FAST_ROW64_(v) \
  UTF8_FAST_ROW16_(v) UTF8_FAST_ROW16_(v) \
  UTF8_FAST_ROW16_(v) UTF8_FAST_ROW16_(v)

// Per-byte flags for the fast scan: 0 for bytes 0x00..0x7f, 1 for bytes
// 0x80..0xff.
static const unsigned char utf8acceptnonsurrogates_fast[256] = {
  // 0x00..0x7f
  UTF8_FAST_ROW64_(0)
  UTF8_FAST_ROW64_(0)
  // 0x80..0xff
  UTF8_FAST_ROW64_(1)
  UTF8_FAST_ROW64_(1)
};

#undef UTF8_FAST_ROW64_
#undef UTF8_FAST_ROW16_
314
315static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
316  utf8acceptnonsurrogates_STATE0,
317  utf8acceptnonsurrogates_STATE0_SIZE,
318  utf8acceptnonsurrogates_TOTAL_SIZE,
319  utf8acceptnonsurrogates_MAX_EXPAND_X4,
320  utf8acceptnonsurrogates_SHIFT,
321  utf8acceptnonsurrogates_BYTES,
322  utf8acceptnonsurrogates_LOSUB,
323  utf8acceptnonsurrogates_HIADD,
324  utf8acceptnonsurrogates,
325  utf8acceptnonsurrogates_remap_base,
326  utf8acceptnonsurrogates_remap_string,
327  utf8acceptnonsurrogates_fast
328};
329
330
// The exit-code shorthands were only needed for the table initializers;
// remove them so they cannot collide with anything that follows.
#undef X__
#undef RJ_
#undef S1_
#undef S2_
#undef S3_
#undef S21
#undef S31
#undef S32
#undef T1_
#undef T2_
#undef S11
#undef SP_
#undef D__
#undef RJA
345
346// Return true if current Tbl pointer is within state0 range
347// Note that unsigned compare checks both ends of range simultaneously
348static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
349  const uint8* Tbl0 = &st->state_table[st->state0];
350  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
351}
352
353// Scan a UTF-8 string based on state table.
354// Always scan complete UTF-8 characters
355// Set number of bytes scanned. Return reason for exiting
356int UTF8GenericScan(const UTF8ScanObj* st,
357                    const char * str,
358                    int str_length,
359                    int* bytes_consumed) {
360  *bytes_consumed = 0;
361  if (str_length == 0) return kExitOK;
362
363  int eshift = st->entry_shift;
364  const uint8* isrc = reinterpret_cast<const uint8*>(str);
365  const uint8* src = isrc;
366  const uint8* srclimit = isrc + str_length;
367  const uint8* srclimit8 = srclimit - 7;
368  const uint8* Tbl_0 = &st->state_table[st->state0];
369
370 DoAgain:
371  // Do state-table scan
372  int e = 0;
373  uint8 c;
374  const uint8* Tbl2 = &st->fast_state[0];
375  const uint32 losub = st->losub;
376  const uint32 hiadd = st->hiadd;
377  // Check initial few bytes one at a time until 8-byte aligned
378  //----------------------------
379  while ((((uintptr_t)src & 0x07) != 0) &&
380         (src < srclimit) &&
381         Tbl2[src[0]] == 0) {
382    src++;
383  }
384  if (((uintptr_t)src & 0x07) == 0) {
385    // Do fast for groups of 8 identity bytes.
386    // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
387    // including slowing slightly on cr/lf/ht
388    //----------------------------
389    while (src < srclimit8) {
390      uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
391      uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
392      src += 8;
393      // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
394      uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
395                    (s4567 - losub) | (s4567 + hiadd);
396      if ((temp & 0x80808080) != 0) {
397        // We typically end up here on cr/lf/ht; src was incremented
398        int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
399                    (Tbl2[src[-6]] | Tbl2[src[-5]]);
400        if (e0123 != 0) {
401          src -= 8;
402          break;
403        }    // Exit on Non-interchange
404        e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
405                (Tbl2[src[-2]] | Tbl2[src[-1]]);
406        if (e0123 != 0) {
407          src -= 4;
408          break;
409        }    // Exit on Non-interchange
410        // Else OK, go around again
411      }
412    }
413  }
414  //----------------------------
415
416  // Byte-at-a-time scan
417  //----------------------------
418  const uint8* Tbl = Tbl_0;
419  while (src < srclimit) {
420    c = *src;
421    e = Tbl[c];
422    src++;
423    if (e >= kExitIllegalStructure) {break;}
424    Tbl = &Tbl_0[e << eshift];
425  }
426  //----------------------------
427
428
429  // Exit posibilities:
430  //  Some exit code, !state0, back up over last char
431  //  Some exit code, state0, back up one byte exactly
432  //  source consumed, !state0, back up over partial char
433  //  source consumed, state0, exit OK
434  // For illegal byte in state0, avoid backup up over PREVIOUS char
435  // For truncated last char, back up to beginning of it
436
437  if (e >= kExitIllegalStructure) {
438    // Back up over exactly one byte of rejected/illegal UTF-8 character
439    src--;
440    // Back up more if needed
441    if (!InStateZero(st, Tbl)) {
442      do {
443        src--;
444      } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
445    }
446  } else if (!InStateZero(st, Tbl)) {
447    // Back up over truncated UTF-8 character
448    e = kExitIllegalStructure;
449    do {
450      src--;
451    } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
452  } else {
453    // Normal termination, source fully consumed
454    e = kExitOK;
455  }
456
457  if (e == kExitDoAgain) {
458    // Loop back up to the fast scan
459    goto DoAgain;
460  }
461
462  *bytes_consumed = src - isrc;
463  return e;
464}
465
466int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
467                    const char * str,
468                    int str_length,
469                    int* bytes_consumed) {
470  *bytes_consumed = 0;
471  if (str_length == 0) return kExitOK;
472
473  const uint8* isrc =  reinterpret_cast<const uint8*>(str);
474  const uint8* src = isrc;
475  const uint8* srclimit = isrc + str_length;
476  const uint8* srclimit8 = srclimit - 7;
477  int n;
478  int rest_consumed;
479  int exit_reason;
480  do {
481    // Check initial few bytes one at a time until 8-byte aligned
482    while ((((uintptr_t)src & 0x07) != 0) &&
483           (src < srclimit) && (src[0] < 0x80)) {
484      src++;
485    }
486    if (((uintptr_t)src & 0x07) == 0) {
487      while ((src < srclimit8) &&
488             (((reinterpret_cast<const uint32*>(src)[0] |
489                reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
490        src += 8;
491      }
492    }
493    while ((src < srclimit) && (src[0] < 0x80)) {
494      src++;
495    }
496    // Run state table on the rest
497    n = src - isrc;
498    exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
499    src += rest_consumed;
500  } while ( exit_reason == kExitDoAgain );
501
502  *bytes_consumed = src - isrc;
503  return exit_reason;
504}
505
// Hack:  Some compilers initialize the static tables at startup rather than
//   at compile time, and the tables cannot be used before that has happened.
//   Some Protocol Buffer parsing nevertheless runs during static
//   initialization and may try to validate UTF-8 strings.  Because UTF-8
//   validation is only used for debugging anyway, validation simply reports
//   success for as long as initialization has not occurred.
namespace {

// Stays false until the static initializer below has run.
bool module_initialized_ = false;

// Its constructor runs as part of this file's static initialization and
// records that fact in module_initialized_.
struct InitDetector {
  InitDetector() { module_initialized_ = true; }
};
InitDetector init_detector;

}  // namespace
524
525bool IsStructurallyValidUTF8(const char* buf, int len) {
526  if (!module_initialized_) return true;
527
528  int bytes_consumed = 0;
529  UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
530                           buf, len, &bytes_consumed);
531  return (bytes_consumed == len);
532}
533
534}  // namespace internal
535}  // namespace protobuf
536}  // namespace google
537