// Copyright 2005-2008 Google Inc. All Rights Reserved.
// Author: jrm@google.com (Jim Meehan)
3fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
4fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#include <google/protobuf/stubs/common.h>
5fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
6b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer#include <google/protobuf/stubs/stringpiece.h>
7b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer
8fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace google {
9fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace protobuf {
10fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace internal {
11fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
12fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// These four-byte entries compactly encode how many bytes 0..255 to delete
13fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// in making a string replacement, how many bytes to add 0..255, and the offset
14fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// 0..64k-1 of the replacement string in remap_string.
15fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestruct RemapEntry {
16fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  uint8 delete_bytes;
17fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  uint8 add_bytes;
18fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  uint16 bytes_offset;
19fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville};
20fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
// Exit type codes for state tables. All but the first get stuffed into
// one-byte (uint8) state-table entries; the first is only generated by
// executable code. To distinguish them from next-state entries, these values
// must be contiguous and all <= kExitNone.
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure,  // 240
  kExitOK,                // 241
  kExitReject,            // 242
  kExitReplace1,          // 243
  kExitReplace2,          // 244
  kExitReplace3,          // 245
  kExitReplace21,         // 246
  kExitReplace31,         // 247
  kExitReplace32,         // 248
  kExitReplaceOffset1,    // 249
  kExitReplaceOffset2,    // 250
  kExitReplace1S0,        // 251
  kExitSpecial,           // 252
  kExitDoAgain,           // 253
  kExitRejectAlt,         // 254
  kExitNone               // 255
} ExitReason;
44fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
45fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
46fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// This struct represents one entire state table. The three initialized byte
47fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// areas are state_table, remap_base, and remap_string. state0 and state0_size
48fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// give the byte offset and length within state_table of the initial state --
49fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// table lookups are expected to start and end in this state, but for
50fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// truncated UTF-8 strings, may end in a different state. These allow a quick
51fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// test for that condition. entry_shift is 8 for tables subscripted by a full
52fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// byte value and 6 for space-optimized tables subscripted by only six
53fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// significant bits in UTF-8 continuation bytes.
54fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilletypedef struct {
55fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint32 state0;
56fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint32 state0_size;
57fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint32 total_size;
58fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const int max_expand;
59fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const int entry_shift;
60fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const int bytes_per_entry;
61fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint32 losub;
62fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint32 hiadd;
63fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* state_table;
64fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const RemapEntry* remap_base;
65fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* remap_string;
66fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* fast_state;
67fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} UTF8StateMachineObj;
68fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
69fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilletypedef UTF8StateMachineObj UTF8ScanObj;
70fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
// Three-character aliases for the ExitReason codes, so that exit entries
// line up in columns with the numeric next-state entries in the tables.
#define X__ (kExitIllegalStructure)
#define RJ_ (kExitReject)
#define S1_ (kExitReplace1)
#define S2_ (kExitReplace2)
#define S3_ (kExitReplace3)
#define S21 (kExitReplace21)
#define S31 (kExitReplace31)
#define S32 (kExitReplace32)
#define T1_ (kExitReplaceOffset1)
#define T2_ (kExitReplaceOffset2)
#define S11 (kExitReplace1S0)
#define SP_ (kExitSpecial)
#define D__ (kExitDoAgain)
#define RJA (kExitRejectAlt)
85fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
// Parameters of the utf8acceptnonsurrogates table.
// The entire table has 9 state blocks of 256 entries each (9 * 256 = 2304).
static const unsigned int utf8acceptnonsurrogates_STATE0 = 0;  // state[0]
// Size of state[0]; with one byte per entry this equals the offset of state[1].
static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256;
static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
95fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
96fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const uint8 utf8acceptnonsurrogates[] = {
97fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[0] 0x000000 Byte 1
98fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
99fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
100fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
101fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
102fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
103fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
104fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
105fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
106fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
107fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
108fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
109fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
110fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
111fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
112fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
113fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
114fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
115fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  2,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   7,   3,   3,
116fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  4,   5,   5,   5,   6, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
117fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
118fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[1] 0x000080 Byte 2 of 2
119fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
120fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
121fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
122fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
123fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
124fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
125fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
126fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
127fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
128fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
129fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
130fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
131fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
132fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
133fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
134fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
135fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
136fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
137fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
138fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
139fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[2] 0x000000 Byte 2 of 3
140fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
141fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
142fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
143fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
144fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
145fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
146fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
147fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
148fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
149fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
150fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
151fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
152fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
153fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
154fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
155fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
156fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
157fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
158fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
159fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
160fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[3] 0x001000 Byte 2 of 3
161fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
162fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
163fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
164fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
165fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
166fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
167fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
168fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
169fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
170fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
171fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
172fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
173fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
174fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
175fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
176fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
177fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
178fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
179fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
180fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
181fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[4] 0x000000 Byte 2 of 4
182fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
183fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
184fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
185fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
186fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
187fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
188fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
189fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
190fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
191fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
192fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
193fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
194fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
195fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
196fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
197fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
198fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
199fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
200fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
201fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
202fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[5] 0x040000 Byte 2 of 4
203fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
204fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
205fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
206fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
207fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
208fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
209fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
210fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
211fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
212fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
213fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
214fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
215fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
216fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
217fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
218fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
219fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
220fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
221fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
222fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
223fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[6] 0x100000 Byte 2 of 4
224fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
225fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
226fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
227fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
228fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
229fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
230fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
231fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
232fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
233fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
234fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
235fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
236fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
237fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
238fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
239fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
240fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
241fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
242fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
243fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
244fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[7] 0x00d000 Byte 2 of 3
245fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
246fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
247fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
248fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
249fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
250fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
251fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
252fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
253fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
254fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
255fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
256fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
257fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
258fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
259fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
260fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
261fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
262fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
263fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
264fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
265fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[8] 0x00d800 Byte 3 of 3
266fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
267fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
268fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
269fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
270fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
271fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
272fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
273fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
274fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
275fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
276fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleRJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
277fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleRJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
278fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleRJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
279fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleRJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
280fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
281fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
282fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
283fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
284fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
285fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville};
286fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
287fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Remap base[0] = (del, add, string_offset)
288fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
289fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville{0, 0, 0} };
290fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
// Remap string[0]
// This table holds a single zero byte.
static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
  0
};
294fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
// Per-byte lookup table with one entry for each of the 256 byte values:
// 0 for bytes 0x00-0x7F and 1 for bytes 0x80-0xFF.
static const unsigned char utf8acceptnonsurrogates_fast[256] = {
// 0x00-0x3F
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

// 0x40-0x7F
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

// 0x80-0xBF
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,

// 0xC0-0xFF
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
};
316fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
317fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const UTF8ScanObj utf8acceptnonsurrogates_obj = {
318fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_STATE0,
319fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_STATE0_SIZE,
320fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_TOTAL_SIZE,
321fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_MAX_EXPAND_X4,
322fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_SHIFT,
323fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_BYTES,
324fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_LOSUB,
325fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_HIADD,
326fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates,
327fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_remap_base,
328fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_remap_string,
329fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  utf8acceptnonsurrogates_fast
330fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville};
331fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
332fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
333fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef X__
334fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef RJ_
335fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S1_
336fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S2_
337fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S3_
338fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S21
339fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S31
340fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S32
341fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef T1_
342fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef T2_
343fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S11
344fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef SP_
345fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef D__
346fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef RJA
347fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
348fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Return true if current Tbl pointer is within state0 range
349fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Note that unsigned compare checks both ends of range simultaneously
350fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
351fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* Tbl0 = &st->state_table[st->state0];
352fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
353fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}
354fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
355fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Scan a UTF-8 string based on state table.
356fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Always scan complete UTF-8 characters
357fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Set number of bytes scanned. Return reason for exiting
358fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleint UTF8GenericScan(const UTF8ScanObj* st,
359fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                    const char * str,
360fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                    int str_length,
361fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                    int* bytes_consumed) {
362fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  *bytes_consumed = 0;
363fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  if (str_length == 0) return kExitOK;
364fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
365fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int eshift = st->entry_shift;
366fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* isrc = reinterpret_cast<const uint8*>(str);
367fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* src = isrc;
368fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* srclimit = isrc + str_length;
369fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* srclimit8 = srclimit - 7;
370fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* Tbl_0 = &st->state_table[st->state0];
371fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
372fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville DoAgain:
373fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Do state-table scan
374fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int e = 0;
375fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  uint8 c;
376fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* Tbl2 = &st->fast_state[0];
377d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  const uint32 losub = st->losub;
378d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  const uint32 hiadd = st->hiadd;
379d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  // Check initial few bytes one at a time until 8-byte aligned
380d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  //----------------------------
381d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  while ((((uintptr_t)src & 0x07) != 0) &&
382d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville         (src < srclimit) &&
383d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville         Tbl2[src[0]] == 0) {
384d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    src++;
385d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  }
386d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  if (((uintptr_t)src & 0x07) == 0) {
387d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    // Do fast for groups of 8 identity bytes.
388d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
389d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    // including slowing slightly on cr/lf/ht
390d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    //----------------------------
391d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    while (src < srclimit8) {
392d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
393d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
394d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      src += 8;
395d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
396d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
397d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville                    (s4567 - losub) | (s4567 + hiadd);
398d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      if ((temp & 0x80808080) != 0) {
399d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville        // We typically end up here on cr/lf/ht; src was incremented
400d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville        int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
401d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville                    (Tbl2[src[-6]] | Tbl2[src[-5]]);
402d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville        if (e0123 != 0) {
403d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville          src -= 8;
404d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville          break;
405d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville        }    // Exit on Non-interchange
406d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville        e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
407d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville                (Tbl2[src[-2]] | Tbl2[src[-1]]);
408d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville        if (e0123 != 0) {
409d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville          src -= 4;
410d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville          break;
411d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville        }    // Exit on Non-interchange
412d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville        // Else OK, go around again
413d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      }
414fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    }
415fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  }
416fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  //----------------------------
417fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
418fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Byte-at-a-time scan
419fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  //----------------------------
420fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* Tbl = Tbl_0;
421fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  while (src < srclimit) {
422fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    c = *src;
423fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    e = Tbl[c];
424fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    src++;
425fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    if (e >= kExitIllegalStructure) {break;}
426fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    Tbl = &Tbl_0[e << eshift];
427fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  }
428fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  //----------------------------
429fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
430fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
431fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Exit posibilities:
432fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  //  Some exit code, !state0, back up over last char
433fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  //  Some exit code, state0, back up one byte exactly
434fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  //  source consumed, !state0, back up over partial char
435fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  //  source consumed, state0, exit OK
436fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // For illegal byte in state0, avoid backup up over PREVIOUS char
437fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // For truncated last char, back up to beginning of it
438fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
439fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  if (e >= kExitIllegalStructure) {
440fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // Back up over exactly one byte of rejected/illegal UTF-8 character
441fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    src--;
442fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // Back up more if needed
443fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    if (!InStateZero(st, Tbl)) {
444fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville      do {
445fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville        src--;
446fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville      } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
447fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    }
448fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  } else if (!InStateZero(st, Tbl)) {
449fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // Back up over truncated UTF-8 character
450fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    e = kExitIllegalStructure;
451fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    do {
452fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville      src--;
453fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
454fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  } else {
455fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // Normal termination, source fully consumed
456fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    e = kExitOK;
457fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  }
458fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
459fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  if (e == kExitDoAgain) {
460fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // Loop back up to the fast scan
461fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    goto DoAgain;
462fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  }
463fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
464fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  *bytes_consumed = src - isrc;
465fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  return e;
466fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}
467fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
468fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleint UTF8GenericScanFastAscii(const UTF8ScanObj* st,
469fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                    const char * str,
470fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                    int str_length,
471fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                    int* bytes_consumed) {
472fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  *bytes_consumed = 0;
473fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  if (str_length == 0) return kExitOK;
474fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
475fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* isrc =  reinterpret_cast<const uint8*>(str);
476fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* src = isrc;
477fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* srclimit = isrc + str_length;
478fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const uint8* srclimit8 = srclimit - 7;
479fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int n;
480fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int rest_consumed;
481fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int exit_reason;
482fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  do {
483d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    // Check initial few bytes one at a time until 8-byte aligned
484d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    while ((((uintptr_t)src & 0x07) != 0) &&
485d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville           (src < srclimit) && (src[0] < 0x80)) {
486d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      src++;
487d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    }
488d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville    if (((uintptr_t)src & 0x07) == 0) {
489d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      while ((src < srclimit8) &&
490d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville             (((reinterpret_cast<const uint32*>(src)[0] |
491d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville                reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
492d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville        src += 8;
493d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville      }
494fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    }
495fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    while ((src < srclimit) && (src[0] < 0x80)) {
496fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville      src++;
497fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    }
498fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // Run state table on the rest
499fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    n = src - isrc;
500fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
501fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    src += rest_consumed;
502fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  } while ( exit_reason == kExitDoAgain );
503fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
504fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  *bytes_consumed = src - isrc;
505fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  return exit_reason;
506fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}
507fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
// Hack:  some compilers fill in the static tables above at startup rather
//   than at compile time, and the tables must not be used before that has
//   happened.  Some Protocol Buffer parsing runs during static
//   initialization and may ask for UTF-8 validation.  Because UTF-8
//   validation is only used for debugging anyway, the validators simply
//   report success until this flag has been set.
namespace {

// Starts out false; set to true by the InitDetector constructor below.
bool module_initialized_ = false;

// The constructor of this file-level object flips the flag when the object
// is constructed.
struct InitDetector {
  InitDetector() { module_initialized_ = true; }
} init_detector;

}  // namespace
526fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
527fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillebool IsStructurallyValidUTF8(const char* buf, int len) {
528fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  if (!module_initialized_) return true;
529fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
530fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int bytes_consumed = 0;
531fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
532fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                           buf, len, &bytes_consumed);
533fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  return (bytes_consumed == len);
534fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}
535fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
536b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammerint UTF8SpnStructurallyValid(const StringPiece& str) {
537b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  if (!module_initialized_) return str.size();
538b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer
539b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  int bytes_consumed = 0;
540b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
541b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer                           str.data(), str.size(), &bytes_consumed);
542b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  return bytes_consumed;
543b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer}
544b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer
545b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// Coerce UTF-8 byte string in src_str to be
546b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// a structurally-valid equal-length string by selectively
547b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// overwriting illegal bytes with replace_char (typically blank).
548b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
549b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// src_str is read-only. If any overwriting is needed, a modified byte string
550b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// is created in idst, length isrclen.
551b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer//
552b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// Returns pointer to output buffer, isrc if no changes were made,
553b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer//  or idst if some bytes were changed.
554b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer//
555b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// Fast case: all is structurally valid and no byte copying is done.
556b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer//
557b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammerchar* UTF8CoerceToStructurallyValid(const StringPiece& src_str,
558b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer                                    char* idst,
559b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer                                    const char replace_char) {
560b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  const char* isrc = src_str.data();
561b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  const int len = src_str.length();
562b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  int n = UTF8SpnStructurallyValid(src_str);
563b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  if (n == len) {               // Normal case -- all is cool, return
564b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer    return const_cast<char*>(isrc);
565b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  } else {                      // Unusual case -- copy w/o bad bytes
566b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer    const char* src = isrc;
567b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer    const char* srclimit = isrc + len;
568b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer    char* dst = idst;
569b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer    memmove(dst, src, n);       // Copy initial good chunk
570b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer    src += n;
571b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer    dst += n;
572b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer    while (src < srclimit) {    // src points to bogus byte or is off the end
573b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer      dst[0] = replace_char;                    // replace one bad byte
574b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer      src++;
575b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer      dst++;
576b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer      StringPiece str2(src, srclimit - src);
577b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer      n = UTF8SpnStructurallyValid(str2);       // scan the remainder
578b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer      memmove(dst, src, n);                     // copy next good chunk
579b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer      src += n;
580b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer      dst += n;
581b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer    }
582b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  }
583b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer  return idst;
584b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer}
585b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer
586fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}  // namespace internal
587fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}  // namespace protobuf
588fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}  // namespace google
589