// Copyright 2005-2008 Google Inc. All Rights Reserved.
// Author: jrm@google.com (Jim Meehan)

#include <google/protobuf/stubs/common.h>

namespace google {
namespace protobuf {
namespace internal {

// These four-byte entries compactly encode how many bytes 0..255 to delete
// in making a string replacement, how many bytes to add 0..255, and the offset
// 0..64k-1 of the replacement string in remap_string.
struct RemapEntry {
  uint8 delete_bytes;   // number of source bytes to delete (0..255)
  uint8 add_bytes;      // number of replacement bytes to add (0..255)
  uint16 bytes_offset;  // offset of the replacement bytes in remap_string
};

// Exit type codes for state tables. All but the first get stuffed into
// one-byte (uint8) state-table entries. The first is only generated by
// executable code.
// To distinguish from next-state entries, these must be contiguous and
// all <= kExitNone. The scanners in this file treat any table entry
// >= kExitIllegalStructure as an exit code and anything smaller as the
// number of the next state.
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure,  // 240
  kExitOK,                // 241
  kExitReject,            // 242
  kExitReplace1,          // 243
  kExitReplace2,          // 244
  kExitReplace3,          // 245
  kExitReplace21,         // 246
  kExitReplace31,         // 247
  kExitReplace32,         // 248
  kExitReplaceOffset1,    // 249
  kExitReplaceOffset2,    // 250
  kExitReplace1S0,        // 251
  kExitSpecial,           // 252
  kExitDoAgain,           // 253
  kExitRejectAlt,         // 254
  kExitNone               // 255
} ExitReason;


// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string. state0 and state0_size
// give the byte offset and length within state_table of the initial state --
// table lookups are expected to start and end in this state, but for
// truncated UTF-8 strings, may end in a different state. These allow a quick
// test for that condition. entry_shift is 8 for tables subscripted by a full
// byte value and 6 for space-optimized tables subscripted by only six
// significant bits in UTF-8 continuation bytes.
typedef struct {
  const uint32 state0;          // byte offset of the initial state in state_table
  const uint32 state0_size;     // byte length of the initial state
  const uint32 total_size;      // total size of state_table
  const int max_expand;         // NOTE(review): not read by the scanners in
                                // this file; meaning not shown here
  const int entry_shift;        // 8 for full-byte subscripts, 6 for six-bit
  const int bytes_per_entry;    // NOTE(review): not read by the scanners in
                                // this file
  const uint32 losub;           // per-byte lower bound used by the 8-byte
                                // fast range check in UTF8GenericScan
  const uint32 hiadd;           // per-byte upper adjustment for the same check
  const uint8* state_table;     // next-state / exit-code entries
  const RemapEntry* remap_base;   // replacement descriptors
  const uint8* remap_string;      // replacement bytes
  const uint8* fast_state;      // 256 entries indexed by byte value; a
                                // nonzero entry stops the fast skip loops
} UTF8StateMachineObj;

typedef UTF8StateMachineObj UTF8ScanObj;

// Short aliases for exit codes so that each table row fits on one line.
// They are #undef'ed again once the tables have been defined.
#define X__ (kExitIllegalStructure)
#define RJ_ (kExitReject)
#define S1_ (kExitReplace1)
#define S2_ (kExitReplace2)
#define S3_ (kExitReplace3)
#define S21 (kExitReplace21)
#define S31 (kExitReplace31)
#define S32 (kExitReplace32)
#define T1_ (kExitReplaceOffset1)
#define T2_ (kExitReplaceOffset2)
#define S11 (kExitReplace1S0)
#define SP_ (kExitSpecial)
#define D__ (kExitDoAgain)
#define RJA (kExitRejectAlt)

// Entire table has 9 state blocks of 256 entries each
static const unsigned int utf8acceptnonsurrogates_STATE0 = 0;         // state[0]
static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256;  // =[1]
static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;

// State table. Each block of 256 entries is one state, subscripted by the
// next input byte. An entry below kExitIllegalStructure is the number of the
// next state; X__ and RJ_ are exit codes.
//
// Transitions out of state[0] (lead byte):
//   0x00-0x7F            stay in state 0
//   0x80-0xC1, 0xF5-0xFF X__
//   0xC2-0xDF            state 1
//   0xE0                 state 2
//   0xE1-0xEC, 0xEE-0xEF state 3
//   0xED                 state 7
//   0xF0                 state 4
//   0xF1-0xF3            state 5
//   0xF4                 state 6
// In every other state only (a sub-range of) 0x80-0xBF is not X__.
static const uint8 utf8acceptnonsurrogates[] = {
// state[0] 0x000000 Byte 1
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
  2,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   7,   3,   3,
  4,   5,   5,   5,   6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[1] 0x000080 Byte 2 of 2
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[2] 0x000000 Byte 2 of 3
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[3] 0x001000 Byte 2 of 3
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[4] 0x000000 Byte 2 of 4
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[5] 0x040000 Byte 2 of 4
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[6] 0x100000 Byte 2 of 4
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[7] 0x00d000 Byte 2 of 3
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[8] 0x00d800 Byte 3 of 3
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
};

// Remap base[0] = (del, add, string_offset)
static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
{0, 0, 0} };

// Remap string[0]
static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
0 };

// Fast-skip table, indexed by byte value: 0 for 0x00-0x7F, 1 for 0x80-0xFF.
// UTF8GenericScan skips bytes whose entry is 0 without consulting the
// state table.
static const unsigned char utf8acceptnonsurrogates_fast[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};

// The scan object handed to the scanners; initializers are listed in the
// declaration order of UTF8StateMachineObj's fields.
static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
  utf8acceptnonsurrogates_STATE0,
  utf8acceptnonsurrogates_STATE0_SIZE,
  utf8acceptnonsurrogates_TOTAL_SIZE,
  utf8acceptnonsurrogates_MAX_EXPAND_X4,
  utf8acceptnonsurrogates_SHIFT,
  utf8acceptnonsurrogates_BYTES,
  utf8acceptnonsurrogates_LOSUB,
  utf8acceptnonsurrogates_HIADD,
  utf8acceptnonsurrogates,
  utf8acceptnonsurrogates_remap_base,
  utf8acceptnonsurrogates_remap_string,
  utf8acceptnonsurrogates_fast
};


#undef X__
#undef RJ_
#undef S1_
#undef S2_
#undef S3_
#undef S21
#undef S31
#undef S32
#undef T1_
#undef T2_
#undef S11
#undef SP_
#undef D__
#undef RJA

// Return true if current Tbl pointer is within state0 range
// Note that unsigned compare checks both ends of range simultaneously:
// a Tbl below Tbl0 gives a negative difference, which the cast to uint32
// turns into a value far larger than state0_size.
static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
  const uint8* Tbl0 = &st->state_table[st->state0];
  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
}

// Scan a UTF-8 string based on state table.
// Always scan complete UTF-8 characters
// Set number of bytes scanned.
Return reason for exiting 356int UTF8GenericScan(const UTF8ScanObj* st, 357 const char * str, 358 int str_length, 359 int* bytes_consumed) { 360 *bytes_consumed = 0; 361 if (str_length == 0) return kExitOK; 362 363 int eshift = st->entry_shift; 364 const uint8* isrc = reinterpret_cast<const uint8*>(str); 365 const uint8* src = isrc; 366 const uint8* srclimit = isrc + str_length; 367 const uint8* srclimit8 = srclimit - 7; 368 const uint8* Tbl_0 = &st->state_table[st->state0]; 369 370 DoAgain: 371 // Do state-table scan 372 int e = 0; 373 uint8 c; 374 const uint8* Tbl2 = &st->fast_state[0]; 375 const uint32 losub = st->losub; 376 const uint32 hiadd = st->hiadd; 377 // Check initial few bytes one at a time until 8-byte aligned 378 //---------------------------- 379 while ((((uintptr_t)src & 0x07) != 0) && 380 (src < srclimit) && 381 Tbl2[src[0]] == 0) { 382 src++; 383 } 384 if (((uintptr_t)src & 0x07) == 0) { 385 // Do fast for groups of 8 identity bytes. 386 // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop, 387 // including slowing slightly on cr/lf/ht 388 //---------------------------- 389 while (src < srclimit8) { 390 uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; 391 uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; 392 src += 8; 393 // This is a fast range check for all bytes in [lowsub..0x80-hiadd) 394 uint32 temp = (s0123 - losub) | (s0123 + hiadd) | 395 (s4567 - losub) | (s4567 + hiadd); 396 if ((temp & 0x80808080) != 0) { 397 // We typically end up here on cr/lf/ht; src was incremented 398 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | 399 (Tbl2[src[-6]] | Tbl2[src[-5]]); 400 if (e0123 != 0) { 401 src -= 8; 402 break; 403 } // Exit on Non-interchange 404 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | 405 (Tbl2[src[-2]] | Tbl2[src[-1]]); 406 if (e0123 != 0) { 407 src -= 4; 408 break; 409 } // Exit on Non-interchange 410 // Else OK, go around again 411 } 412 } 413 } 414 //---------------------------- 415 416 // Byte-at-a-time 
scan 417 //---------------------------- 418 const uint8* Tbl = Tbl_0; 419 while (src < srclimit) { 420 c = *src; 421 e = Tbl[c]; 422 src++; 423 if (e >= kExitIllegalStructure) {break;} 424 Tbl = &Tbl_0[e << eshift]; 425 } 426 //---------------------------- 427 428 429 // Exit posibilities: 430 // Some exit code, !state0, back up over last char 431 // Some exit code, state0, back up one byte exactly 432 // source consumed, !state0, back up over partial char 433 // source consumed, state0, exit OK 434 // For illegal byte in state0, avoid backup up over PREVIOUS char 435 // For truncated last char, back up to beginning of it 436 437 if (e >= kExitIllegalStructure) { 438 // Back up over exactly one byte of rejected/illegal UTF-8 character 439 src--; 440 // Back up more if needed 441 if (!InStateZero(st, Tbl)) { 442 do { 443 src--; 444 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 445 } 446 } else if (!InStateZero(st, Tbl)) { 447 // Back up over truncated UTF-8 character 448 e = kExitIllegalStructure; 449 do { 450 src--; 451 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 452 } else { 453 // Normal termination, source fully consumed 454 e = kExitOK; 455 } 456 457 if (e == kExitDoAgain) { 458 // Loop back up to the fast scan 459 goto DoAgain; 460 } 461 462 *bytes_consumed = src - isrc; 463 return e; 464} 465 466int UTF8GenericScanFastAscii(const UTF8ScanObj* st, 467 const char * str, 468 int str_length, 469 int* bytes_consumed) { 470 *bytes_consumed = 0; 471 if (str_length == 0) return kExitOK; 472 473 const uint8* isrc = reinterpret_cast<const uint8*>(str); 474 const uint8* src = isrc; 475 const uint8* srclimit = isrc + str_length; 476 const uint8* srclimit8 = srclimit - 7; 477 int n; 478 int rest_consumed; 479 int exit_reason; 480 do { 481 // Check initial few bytes one at a time until 8-byte aligned 482 while ((((uintptr_t)src & 0x07) != 0) && 483 (src < srclimit) && (src[0] < 0x80)) { 484 src++; 485 } 486 if (((uintptr_t)src & 0x07) == 0) { 487 while 
((src < srclimit8) && 488 (((reinterpret_cast<const uint32*>(src)[0] | 489 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { 490 src += 8; 491 } 492 } 493 while ((src < srclimit) && (src[0] < 0x80)) { 494 src++; 495 } 496 // Run state table on the rest 497 n = src - isrc; 498 exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed); 499 src += rest_consumed; 500 } while ( exit_reason == kExitDoAgain ); 501 502 *bytes_consumed = src - isrc; 503 return exit_reason; 504} 505 506// Hack: On some compilers the static tables are initialized at startup. 507// We can't use them until they are initialized. However, some Protocol 508// Buffer parsing happens at static init time and may try to validate 509// UTF-8 strings. Since UTF-8 validation is only used for debugging 510// anyway, we simply always return success if initialization hasn't 511// occurred yet. 512namespace { 513 514bool module_initialized_ = false; 515 516struct InitDetector { 517 InitDetector() { 518 module_initialized_ = true; 519 } 520}; 521InitDetector init_detector; 522 523} // namespace 524 525bool IsStructurallyValidUTF8(const char* buf, int len) { 526 if (!module_initialized_) return true; 527 528 int bytes_consumed = 0; 529 UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj, 530 buf, len, &bytes_consumed); 531 return (bytes_consumed == len); 532} 533 534} // namespace internal 535} // namespace protobuf 536} // namespace google 537