// structurally_valid.cc revision fbaaef999ba563838ebd00874ed8a1c01fbf286d
1// Copyright 2005-2008 Google Inc. All Rights Reserved. 2// Author: jrm@google.com (Jim Meehan) 3 4#include <google/protobuf/stubs/common.h> 5 6namespace google { 7namespace protobuf { 8namespace internal { 9 10// These four-byte entries compactly encode how many bytes 0..255 to delete 11// in making a string replacement, how many bytes to add 0..255, and the offset 12// 0..64k-1 of the replacement string in remap_string. 13struct RemapEntry { 14 uint8 delete_bytes; 15 uint8 add_bytes; 16 uint16 bytes_offset; 17}; 18 19// Exit type codes for state tables. All but the first get stuffed into 20// signed one-byte entries. The first is only generated by executable code. 21// To distinguish from next-state entries, these must be contiguous and 22// all <= kExitNone 23typedef enum { 24 kExitDstSpaceFull = 239, 25 kExitIllegalStructure, // 240 26 kExitOK, // 241 27 kExitReject, // ... 28 kExitReplace1, 29 kExitReplace2, 30 kExitReplace3, 31 kExitReplace21, 32 kExitReplace31, 33 kExitReplace32, 34 kExitReplaceOffset1, 35 kExitReplaceOffset2, 36 kExitReplace1S0, 37 kExitSpecial, 38 kExitDoAgain, 39 kExitRejectAlt, 40 kExitNone // 255 41} ExitReason; 42 43 44// This struct represents one entire state table. The three initialized byte 45// areas are state_table, remap_base, and remap_string. state0 and state0_size 46// give the byte offset and length within state_table of the initial state -- 47// table lookups are expected to start and end in this state, but for 48// truncated UTF-8 strings, may end in a different state. These allow a quick 49// test for that condition. entry_shift is 8 for tables subscripted by a full 50// byte value and 6 for space-optimized tables subscripted by only six 51// significant bits in UTF-8 continuation bytes. 
52typedef struct { 53 const uint32 state0; 54 const uint32 state0_size; 55 const uint32 total_size; 56 const int max_expand; 57 const int entry_shift; 58 const int bytes_per_entry; 59 const uint32 losub; 60 const uint32 hiadd; 61 const uint8* state_table; 62 const RemapEntry* remap_base; 63 const uint8* remap_string; 64 const uint8* fast_state; 65} UTF8StateMachineObj; 66 67typedef UTF8StateMachineObj UTF8ScanObj; 68 69#define X__ (kExitIllegalStructure) 70#define RJ_ (kExitReject) 71#define S1_ (kExitReplace1) 72#define S2_ (kExitReplace2) 73#define S3_ (kExitReplace3) 74#define S21 (kExitReplace21) 75#define S31 (kExitReplace31) 76#define S32 (kExitReplace32) 77#define T1_ (kExitReplaceOffset1) 78#define T2_ (kExitReplaceOffset2) 79#define S11 (kExitReplace1S0) 80#define SP_ (kExitSpecial) 81#define D__ (kExitDoAgain) 82#define RJA (kExitRejectAlt) 83 84// Entire table has 9 state blocks of 256 entries each 85static const unsigned int utf8acceptnonsurrogates_STATE0 = 0; // state[0] 86static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256; // =[1] 87static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304; 88static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0; 89static const unsigned int utf8acceptnonsurrogates_SHIFT = 8; 90static const unsigned int utf8acceptnonsurrogates_BYTES = 1; 91static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020; 92static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000; 93 94static const uint8 utf8acceptnonsurrogates[] = { 95// state[0] 0x000000 Byte 1 96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 98 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100 101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 105 
106X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 107X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 108X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 109X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 110 111X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 112 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 113 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3, 114 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 115 116// state[1] 0x000080 Byte 2 of 2 117X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 118X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 119X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 120X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 121 122X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 123X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 124X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 125X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 126 127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 131 132X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 133X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 134X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 135X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 136 137// state[2] 0x000000 Byte 2 of 3 138X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 139X__, X__, X__, X__, X__, X__, X__, X__, X__, 
X__, X__, X__, X__, X__, X__, X__, 140X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 141X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 142 143X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 144X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 145X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 146X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 147 148X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 149X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 150 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 151 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 152 153X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 154X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 155X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 156X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 157 158// state[3] 0x001000 Byte 2 of 3 159X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 160X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 161X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 162X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 163 164X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 165X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 166X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 167X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 168 169 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 170 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 171 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 172 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 173 174X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 175X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 176X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 177X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 178 179// state[4] 0x000000 Byte 2 of 4 180X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 181X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 182X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 183X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 184 185X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 186X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 187X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 188X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 189 190X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 191 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 192 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 193 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 194 195X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 196X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 197X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 198X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 199 200// state[5] 0x040000 Byte 2 of 4 201X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 202X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 203X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 204X__, X__, X__, 
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 205 206X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 207X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 208X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 209X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 210 211 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 212 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 213 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 214 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 215 216X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 217X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 218X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 219X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 220 221// state[6] 0x100000 Byte 2 of 4 222X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 223X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 224X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 225X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 226 227X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 228X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 229X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 230X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 231 232 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 233X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 234X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 235X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 236 237X__, X__, X__, X__, X__, X__, 
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 238X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 239X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 240X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 241 242// state[7] 0x00d000 Byte 2 of 3 243X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 244X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 245X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 246X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 247 248X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 249X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 250X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 251X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 252 253 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 254 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 255 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 256 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 257 258X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 259X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 260X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 261X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 262 263// state[8] 0x00d800 Byte 3 of 3 264X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 265X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 266X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 267X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 268 269X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 
X__, X__, X__, 270X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 271X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 272X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 273 274RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 275RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 276RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 277RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 278 279X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 280X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 281X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 282X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 283}; 284 285// Remap base[0] = (del, add, string_offset) 286static const RemapEntry utf8acceptnonsurrogates_remap_base[] = { 287{0, 0, 0} }; 288 289// Remap string[0] 290static const unsigned char utf8acceptnonsurrogates_remap_string[] = { 2910 }; 292 293static const unsigned char utf8acceptnonsurrogates_fast[256] = { 2940, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2950, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2970, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 298 2990, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3020, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 303 3041, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3051, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3061, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3071, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 308 3091, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3101, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3111, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3121, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 313}; 314 315static const UTF8ScanObj utf8acceptnonsurrogates_obj = { 316 utf8acceptnonsurrogates_STATE0, 317 utf8acceptnonsurrogates_STATE0_SIZE, 318 utf8acceptnonsurrogates_TOTAL_SIZE, 319 utf8acceptnonsurrogates_MAX_EXPAND_X4, 320 utf8acceptnonsurrogates_SHIFT, 321 utf8acceptnonsurrogates_BYTES, 322 utf8acceptnonsurrogates_LOSUB, 323 utf8acceptnonsurrogates_HIADD, 324 utf8acceptnonsurrogates, 325 utf8acceptnonsurrogates_remap_base, 326 utf8acceptnonsurrogates_remap_string, 327 utf8acceptnonsurrogates_fast 328}; 329 330 331#undef X__ 332#undef RJ_ 333#undef S1_ 334#undef S2_ 335#undef S3_ 336#undef S21 337#undef S31 338#undef S32 339#undef T1_ 340#undef T2_ 341#undef S11 342#undef SP_ 343#undef D__ 344#undef RJA 345 346// Return true if current Tbl pointer is within state0 range 347// Note that unsigned compare checks both ends of range simultaneously 348static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { 349 const uint8* Tbl0 = &st->state_table[st->state0]; 350 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); 351} 352 353// Scan a UTF-8 string based on state table. 354// Always scan complete UTF-8 characters 355// Set number of bytes scanned. Return reason for exiting 356int UTF8GenericScan(const UTF8ScanObj* st, 357 const char * str, 358 int str_length, 359 int* bytes_consumed) { 360 *bytes_consumed = 0; 361 if (str_length == 0) return kExitOK; 362 363 int eshift = st->entry_shift; 364 const uint8* isrc = reinterpret_cast<const uint8*>(str); 365 const uint8* src = isrc; 366 const uint8* srclimit = isrc + str_length; 367 const uint8* srclimit8 = srclimit - 7; 368 const uint8* Tbl_0 = &st->state_table[st->state0]; 369 370 DoAgain: 371 // Do state-table scan 372 int e = 0; 373 uint8 c; 374 375 // Do fast for groups of 8 identity bytes. 
376 // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop, 377 // including slowing slightly on cr/lf/ht 378 //---------------------------- 379 const uint8* Tbl2 = &st->fast_state[0]; 380 uint32 losub = st->losub; 381 uint32 hiadd = st->hiadd; 382 while (src < srclimit8) { 383 uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; 384 uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; 385 src += 8; 386 // This is a fast range check for all bytes in [lowsub..0x80-hiadd) 387 uint32 temp = (s0123 - losub) | (s0123 + hiadd) | 388 (s4567 - losub) | (s4567 + hiadd); 389 if ((temp & 0x80808080) != 0) { 390 // We typically end up here on cr/lf/ht; src was incremented 391 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | 392 (Tbl2[src[-6]] | Tbl2[src[-5]]); 393 if (e0123 != 0) { 394 src -= 8; 395 break; 396 } // Exit on Non-interchange 397 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | 398 (Tbl2[src[-2]] | Tbl2[src[-1]]); 399 if (e0123 != 0) { 400 src -= 4; 401 break; 402 } // Exit on Non-interchange 403 // Else OK, go around again 404 } 405 } 406 //---------------------------- 407 408 // Byte-at-a-time scan 409 //---------------------------- 410 const uint8* Tbl = Tbl_0; 411 while (src < srclimit) { 412 c = *src; 413 e = Tbl[c]; 414 src++; 415 if (e >= kExitIllegalStructure) {break;} 416 Tbl = &Tbl_0[e << eshift]; 417 } 418 //---------------------------- 419 420 421 // Exit posibilities: 422 // Some exit code, !state0, back up over last char 423 // Some exit code, state0, back up one byte exactly 424 // source consumed, !state0, back up over partial char 425 // source consumed, state0, exit OK 426 // For illegal byte in state0, avoid backup up over PREVIOUS char 427 // For truncated last char, back up to beginning of it 428 429 if (e >= kExitIllegalStructure) { 430 // Back up over exactly one byte of rejected/illegal UTF-8 character 431 src--; 432 // Back up more if needed 433 if (!InStateZero(st, Tbl)) { 434 do { 435 src--; 436 } while ((src > isrc) && 
((src[0] & 0xc0) == 0x80)); 437 } 438 } else if (!InStateZero(st, Tbl)) { 439 // Back up over truncated UTF-8 character 440 e = kExitIllegalStructure; 441 do { 442 src--; 443 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 444 } else { 445 // Normal termination, source fully consumed 446 e = kExitOK; 447 } 448 449 if (e == kExitDoAgain) { 450 // Loop back up to the fast scan 451 goto DoAgain; 452 } 453 454 *bytes_consumed = src - isrc; 455 return e; 456} 457 458int UTF8GenericScanFastAscii(const UTF8ScanObj* st, 459 const char * str, 460 int str_length, 461 int* bytes_consumed) { 462 *bytes_consumed = 0; 463 if (str_length == 0) return kExitOK; 464 465 const uint8* isrc = reinterpret_cast<const uint8*>(str); 466 const uint8* src = isrc; 467 const uint8* srclimit = isrc + str_length; 468 const uint8* srclimit8 = srclimit - 7; 469 int n; 470 int rest_consumed; 471 int exit_reason; 472 do { 473 while ((src < srclimit8) && 474 (((reinterpret_cast<const uint32*>(src)[0] | 475 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { 476 src += 8; 477 } 478 while ((src < srclimit) && (src[0] < 0x80)) { 479 src++; 480 } 481 // Run state table on the rest 482 n = src - isrc; 483 exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed); 484 src += rest_consumed; 485 } while ( exit_reason == kExitDoAgain ); 486 487 *bytes_consumed = src - isrc; 488 return exit_reason; 489} 490 491// Hack: On some compilers the static tables are initialized at startup. 492// We can't use them until they are initialized. However, some Protocol 493// Buffer parsing happens at static init time and may try to validate 494// UTF-8 strings. Since UTF-8 validation is only used for debugging 495// anyway, we simply always return success if initialization hasn't 496// occurred yet. 
497namespace { 498 499bool module_initialized_ = false; 500 501struct InitDetector { 502 InitDetector() { 503 module_initialized_ = true; 504 } 505}; 506InitDetector init_detector; 507 508} // namespace 509 510bool IsStructurallyValidUTF8(const char* buf, int len) { 511 if (!module_initialized_) return true; 512 513 int bytes_consumed = 0; 514 UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj, 515 buf, len, &bytes_consumed); 516 return (bytes_consumed == len); 517} 518 519} // namespace internal 520} // namespace protobuf 521} // namespace google 522