1// Copyright 2005-2008 Google Inc. All Rights Reserved.
2// Author: jrm@google.com (Jim Meehan)
3
4#include <google/protobuf/stubs/common.h>
5
6#include <google/protobuf/stubs/stringpiece.h>
7
8namespace google {
9namespace protobuf {
10namespace internal {
11
12// These four-byte entries compactly encode how many bytes 0..255 to delete
13// in making a string replacement, how many bytes to add 0..255, and the offset
14// 0..64k-1 of the replacement string in remap_string.
15struct RemapEntry {
16  uint8 delete_bytes;
17  uint8 add_bytes;
18  uint16 bytes_offset;
19};
20
21// Exit type codes for state tables. All but the first get stuffed into
22// signed one-byte entries. The first is only generated by executable code.
23// To distinguish from next-state entries, these must be contiguous and
24// all <= kExitNone
25typedef enum {
26  kExitDstSpaceFull = 239,
27  kExitIllegalStructure,  // 240
28  kExitOK,                // 241
29  kExitReject,            // ...
30  kExitReplace1,
31  kExitReplace2,
32  kExitReplace3,
33  kExitReplace21,
34  kExitReplace31,
35  kExitReplace32,
36  kExitReplaceOffset1,
37  kExitReplaceOffset2,
38  kExitReplace1S0,
39  kExitSpecial,
40  kExitDoAgain,
41  kExitRejectAlt,
42  kExitNone               // 255
43} ExitReason;
44
45
46// This struct represents one entire state table. The three initialized byte
47// areas are state_table, remap_base, and remap_string. state0 and state0_size
48// give the byte offset and length within state_table of the initial state --
49// table lookups are expected to start and end in this state, but for
50// truncated UTF-8 strings, may end in a different state. These allow a quick
51// test for that condition. entry_shift is 8 for tables subscripted by a full
52// byte value and 6 for space-optimized tables subscripted by only six
53// significant bits in UTF-8 continuation bytes.
54typedef struct {
55  const uint32 state0;
56  const uint32 state0_size;
57  const uint32 total_size;
58  const int max_expand;
59  const int entry_shift;
60  const int bytes_per_entry;
61  const uint32 losub;
62  const uint32 hiadd;
63  const uint8* state_table;
64  const RemapEntry* remap_base;
65  const uint8* remap_string;
66  const uint8* fast_state;
67} UTF8StateMachineObj;
68
69typedef UTF8StateMachineObj UTF8ScanObj;
70
71#define X__ (kExitIllegalStructure)
72#define RJ_ (kExitReject)
73#define S1_ (kExitReplace1)
74#define S2_ (kExitReplace2)
75#define S3_ (kExitReplace3)
76#define S21 (kExitReplace21)
77#define S31 (kExitReplace31)
78#define S32 (kExitReplace32)
79#define T1_ (kExitReplaceOffset1)
80#define T2_ (kExitReplaceOffset2)
81#define S11 (kExitReplace1S0)
82#define SP_ (kExitSpecial)
83#define D__ (kExitDoAgain)
84#define RJA (kExitRejectAlt)
85
86//  Entire table has 9 state blocks of 256 entries each
87static const unsigned int utf8acceptnonsurrogates_STATE0 = 0;     // state[0]
88static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256;  // =[1]
89static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
90static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
91static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
92static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
93static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
94static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
95
96static const uint8 utf8acceptnonsurrogates[] = {
97// state[0] 0x000000 Byte 1
98  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
99  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
100  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
101  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
102
103  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
104  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
105  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
106  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
107
108X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
109X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
110X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
111X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
112
113X__, X__,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
114  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
115  2,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   7,   3,   3,
116  4,   5,   5,   5,   6, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
117
118// state[1] 0x000080 Byte 2 of 2
119X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
120X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
121X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
122X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
123
124X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
125X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
126X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
127X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
128
129  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
130  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
131  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
132  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
133
134X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
135X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
136X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
137X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
138
139// state[2] 0x000000 Byte 2 of 3
140X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
141X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
142X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
143X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
144
145X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
146X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
147X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
148X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
149
150X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
151X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
152  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
153  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
154
155X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
156X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
157X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
158X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
159
160// state[3] 0x001000 Byte 2 of 3
161X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
162X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
163X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
164X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
165
166X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
167X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
168X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
169X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
170
171  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
172  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
173  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
174  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
175
176X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
177X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
178X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
179X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
180
181// state[4] 0x000000 Byte 2 of 4
182X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
183X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
184X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
185X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
186
187X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
188X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
189X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
190X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
191
192X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
193  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
194  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
195  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
196
197X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
198X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
199X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
200X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
201
202// state[5] 0x040000 Byte 2 of 4
203X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
204X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
205X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
206X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
207
208X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
209X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
210X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
211X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
212
213  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
214  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
215  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
216  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
217
218X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
219X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
220X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
221X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
222
223// state[6] 0x100000 Byte 2 of 4
224X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
225X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
226X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
227X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
228
229X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
230X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
231X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
232X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
233
234  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
235X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
236X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
237X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
238
239X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
240X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
241X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
242X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
243
244// state[7] 0x00d000 Byte 2 of 3
245X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
246X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
247X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
248X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
249
250X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
251X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
252X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
253X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
254
255  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
256  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
257  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
258  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
259
260X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
261X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
262X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
263X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
264
265// state[8] 0x00d800 Byte 3 of 3
266X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
267X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
268X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
269X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
270
271X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
272X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
273X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
274X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
275
276RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
277RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
278RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
279RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
280
281X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
282X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
283X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
284X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
285};
286
287// Remap base[0] = (del, add, string_offset)
288static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
289{0, 0, 0} };
290
291// Remap string[0]
292static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
2930 };
294
295static const unsigned char utf8acceptnonsurrogates_fast[256] = {
2960, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
2970, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
2980, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
2990, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
300
3010, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
3020, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
3030, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
3040, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
305
3061, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
3071, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
3081, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
3091, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
310
3111, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
3121, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
3131, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
3141, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
315};
316
317static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
318  utf8acceptnonsurrogates_STATE0,
319  utf8acceptnonsurrogates_STATE0_SIZE,
320  utf8acceptnonsurrogates_TOTAL_SIZE,
321  utf8acceptnonsurrogates_MAX_EXPAND_X4,
322  utf8acceptnonsurrogates_SHIFT,
323  utf8acceptnonsurrogates_BYTES,
324  utf8acceptnonsurrogates_LOSUB,
325  utf8acceptnonsurrogates_HIADD,
326  utf8acceptnonsurrogates,
327  utf8acceptnonsurrogates_remap_base,
328  utf8acceptnonsurrogates_remap_string,
329  utf8acceptnonsurrogates_fast
330};
331
332
333#undef X__
334#undef RJ_
335#undef S1_
336#undef S2_
337#undef S3_
338#undef S21
339#undef S31
340#undef S32
341#undef T1_
342#undef T2_
343#undef S11
344#undef SP_
345#undef D__
346#undef RJA
347
348// Return true if current Tbl pointer is within state0 range
349// Note that unsigned compare checks both ends of range simultaneously
350static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
351  const uint8* Tbl0 = &st->state_table[st->state0];
352  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
353}
354
355// Scan a UTF-8 string based on state table.
356// Always scan complete UTF-8 characters
357// Set number of bytes scanned. Return reason for exiting
358int UTF8GenericScan(const UTF8ScanObj* st,
359                    const char * str,
360                    int str_length,
361                    int* bytes_consumed) {
362  *bytes_consumed = 0;
363  if (str_length == 0) return kExitOK;
364
365  int eshift = st->entry_shift;
366  const uint8* isrc = reinterpret_cast<const uint8*>(str);
367  const uint8* src = isrc;
368  const uint8* srclimit = isrc + str_length;
369  const uint8* srclimit8 = srclimit - 7;
370  const uint8* Tbl_0 = &st->state_table[st->state0];
371
372 DoAgain:
373  // Do state-table scan
374  int e = 0;
375  uint8 c;
376  const uint8* Tbl2 = &st->fast_state[0];
377  const uint32 losub = st->losub;
378  const uint32 hiadd = st->hiadd;
379  // Check initial few bytes one at a time until 8-byte aligned
380  //----------------------------
381  while ((((uintptr_t)src & 0x07) != 0) &&
382         (src < srclimit) &&
383         Tbl2[src[0]] == 0) {
384    src++;
385  }
386  if (((uintptr_t)src & 0x07) == 0) {
387    // Do fast for groups of 8 identity bytes.
388    // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
389    // including slowing slightly on cr/lf/ht
390    //----------------------------
391    while (src < srclimit8) {
392      uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
393      uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
394      src += 8;
395      // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
396      uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
397                    (s4567 - losub) | (s4567 + hiadd);
398      if ((temp & 0x80808080) != 0) {
399        // We typically end up here on cr/lf/ht; src was incremented
400        int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
401                    (Tbl2[src[-6]] | Tbl2[src[-5]]);
402        if (e0123 != 0) {
403          src -= 8;
404          break;
405        }    // Exit on Non-interchange
406        e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
407                (Tbl2[src[-2]] | Tbl2[src[-1]]);
408        if (e0123 != 0) {
409          src -= 4;
410          break;
411        }    // Exit on Non-interchange
412        // Else OK, go around again
413      }
414    }
415  }
416  //----------------------------
417
418  // Byte-at-a-time scan
419  //----------------------------
420  const uint8* Tbl = Tbl_0;
421  while (src < srclimit) {
422    c = *src;
423    e = Tbl[c];
424    src++;
425    if (e >= kExitIllegalStructure) {break;}
426    Tbl = &Tbl_0[e << eshift];
427  }
428  //----------------------------
429
430
431  // Exit posibilities:
432  //  Some exit code, !state0, back up over last char
433  //  Some exit code, state0, back up one byte exactly
434  //  source consumed, !state0, back up over partial char
435  //  source consumed, state0, exit OK
436  // For illegal byte in state0, avoid backup up over PREVIOUS char
437  // For truncated last char, back up to beginning of it
438
439  if (e >= kExitIllegalStructure) {
440    // Back up over exactly one byte of rejected/illegal UTF-8 character
441    src--;
442    // Back up more if needed
443    if (!InStateZero(st, Tbl)) {
444      do {
445        src--;
446      } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
447    }
448  } else if (!InStateZero(st, Tbl)) {
449    // Back up over truncated UTF-8 character
450    e = kExitIllegalStructure;
451    do {
452      src--;
453    } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
454  } else {
455    // Normal termination, source fully consumed
456    e = kExitOK;
457  }
458
459  if (e == kExitDoAgain) {
460    // Loop back up to the fast scan
461    goto DoAgain;
462  }
463
464  *bytes_consumed = src - isrc;
465  return e;
466}
467
468int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
469                    const char * str,
470                    int str_length,
471                    int* bytes_consumed) {
472  *bytes_consumed = 0;
473  if (str_length == 0) return kExitOK;
474
475  const uint8* isrc =  reinterpret_cast<const uint8*>(str);
476  const uint8* src = isrc;
477  const uint8* srclimit = isrc + str_length;
478  const uint8* srclimit8 = srclimit - 7;
479  int n;
480  int rest_consumed;
481  int exit_reason;
482  do {
483    // Check initial few bytes one at a time until 8-byte aligned
484    while ((((uintptr_t)src & 0x07) != 0) &&
485           (src < srclimit) && (src[0] < 0x80)) {
486      src++;
487    }
488    if (((uintptr_t)src & 0x07) == 0) {
489      while ((src < srclimit8) &&
490             (((reinterpret_cast<const uint32*>(src)[0] |
491                reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
492        src += 8;
493      }
494    }
495    while ((src < srclimit) && (src[0] < 0x80)) {
496      src++;
497    }
498    // Run state table on the rest
499    n = src - isrc;
500    exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
501    src += rest_consumed;
502  } while ( exit_reason == kExitDoAgain );
503
504  *bytes_consumed = src - isrc;
505  return exit_reason;
506}
507
508// Hack:  On some compilers the static tables are initialized at startup.
509//   We can't use them until they are initialized.  However, some Protocol
510//   Buffer parsing happens at static init time and may try to validate
511//   UTF-8 strings.  Since UTF-8 validation is only used for debugging
512//   anyway, we simply always return success if initialization hasn't
513//   occurred yet.
514namespace {
515
516bool module_initialized_ = false;
517
518struct InitDetector {
519  InitDetector() {
520    module_initialized_ = true;
521  }
522};
523InitDetector init_detector;
524
525}  // namespace
526
527bool IsStructurallyValidUTF8(const char* buf, int len) {
528  if (!module_initialized_) return true;
529
530  int bytes_consumed = 0;
531  UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
532                           buf, len, &bytes_consumed);
533  return (bytes_consumed == len);
534}
535
536int UTF8SpnStructurallyValid(const StringPiece& str) {
537  if (!module_initialized_) return str.size();
538
539  int bytes_consumed = 0;
540  UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
541                           str.data(), str.size(), &bytes_consumed);
542  return bytes_consumed;
543}
544
545// Coerce UTF-8 byte string in src_str to be
546// a structurally-valid equal-length string by selectively
547// overwriting illegal bytes with replace_char (typically blank).
548// replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
549// src_str is read-only. If any overwriting is needed, a modified byte string
550// is created in idst, length isrclen.
551//
552// Returns pointer to output buffer, isrc if no changes were made,
553//  or idst if some bytes were changed.
554//
555// Fast case: all is structurally valid and no byte copying is done.
556//
557char* UTF8CoerceToStructurallyValid(const StringPiece& src_str,
558                                    char* idst,
559                                    const char replace_char) {
560  const char* isrc = src_str.data();
561  const int len = src_str.length();
562  int n = UTF8SpnStructurallyValid(src_str);
563  if (n == len) {               // Normal case -- all is cool, return
564    return const_cast<char*>(isrc);
565  } else {                      // Unusual case -- copy w/o bad bytes
566    const char* src = isrc;
567    const char* srclimit = isrc + len;
568    char* dst = idst;
569    memmove(dst, src, n);       // Copy initial good chunk
570    src += n;
571    dst += n;
572    while (src < srclimit) {    // src points to bogus byte or is off the end
573      dst[0] = replace_char;                    // replace one bad byte
574      src++;
575      dst++;
576      StringPiece str2(src, srclimit - src);
577      n = UTF8SpnStructurallyValid(str2);       // scan the remainder
578      memmove(dst, src, n);                     // copy next good chunk
579      src += n;
580      dst += n;
581    }
582  }
583  return idst;
584}
585
586}  // namespace internal
587}  // namespace protobuf
588}  // namespace google
589