1// Copyright 2005-2008 Google Inc. All Rights Reserved. 2// Author: jrm@google.com (Jim Meehan) 3 4#include <google/protobuf/stubs/common.h> 5 6#include <google/protobuf/stubs/stringpiece.h> 7 8namespace google { 9namespace protobuf { 10namespace internal { 11 12// These four-byte entries compactly encode how many bytes 0..255 to delete 13// in making a string replacement, how many bytes to add 0..255, and the offset 14// 0..64k-1 of the replacement string in remap_string. 15struct RemapEntry { 16 uint8 delete_bytes; 17 uint8 add_bytes; 18 uint16 bytes_offset; 19}; 20 21// Exit type codes for state tables. All but the first get stuffed into 22// signed one-byte entries. The first is only generated by executable code. 23// To distinguish from next-state entries, these must be contiguous and 24// all <= kExitNone 25typedef enum { 26 kExitDstSpaceFull = 239, 27 kExitIllegalStructure, // 240 28 kExitOK, // 241 29 kExitReject, // ... 30 kExitReplace1, 31 kExitReplace2, 32 kExitReplace3, 33 kExitReplace21, 34 kExitReplace31, 35 kExitReplace32, 36 kExitReplaceOffset1, 37 kExitReplaceOffset2, 38 kExitReplace1S0, 39 kExitSpecial, 40 kExitDoAgain, 41 kExitRejectAlt, 42 kExitNone // 255 43} ExitReason; 44 45 46// This struct represents one entire state table. The three initialized byte 47// areas are state_table, remap_base, and remap_string. state0 and state0_size 48// give the byte offset and length within state_table of the initial state -- 49// table lookups are expected to start and end in this state, but for 50// truncated UTF-8 strings, may end in a different state. These allow a quick 51// test for that condition. entry_shift is 8 for tables subscripted by a full 52// byte value and 6 for space-optimized tables subscripted by only six 53// significant bits in UTF-8 continuation bytes. 54typedef struct { 55 const uint32 state0; 56 const uint32 state0_size; 57 const uint32 total_size; 58 const int max_expand; 59 const int entry_shift; 60 const int bytes_per_entry; 61 const uint32 losub; 62 const uint32 hiadd; 63 const uint8* state_table; 64 const RemapEntry* remap_base; 65 const uint8* remap_string; 66 const uint8* fast_state; 67} UTF8StateMachineObj; 68 69typedef UTF8StateMachineObj UTF8ScanObj; 70 71#define X__ (kExitIllegalStructure) 72#define RJ_ (kExitReject) 73#define S1_ (kExitReplace1) 74#define S2_ (kExitReplace2) 75#define S3_ (kExitReplace3) 76#define S21 (kExitReplace21) 77#define S31 (kExitReplace31) 78#define S32 (kExitReplace32) 79#define T1_ (kExitReplaceOffset1) 80#define T2_ (kExitReplaceOffset2) 81#define S11 (kExitReplace1S0) 82#define SP_ (kExitSpecial) 83#define D__ (kExitDoAgain) 84#define RJA (kExitRejectAlt) 85 86// Entire table has 9 state blocks of 256 entries each 87static const unsigned int utf8acceptnonsurrogates_STATE0 = 0; // state[0] 88static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256; // =[1] 89static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304; 90static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0; 91static const unsigned int utf8acceptnonsurrogates_SHIFT = 8; 92static const unsigned int utf8acceptnonsurrogates_BYTES = 1; 93static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020; 94static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000; 95 96static const uint8 utf8acceptnonsurrogates[] = { 97// state[0] 0x000000 Byte 1 98 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102 103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 107 108X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 109X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 110X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 111X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 112 113X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 114 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 115 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3, 116 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 117 118// state[1] 0x000080 Byte 2 of 2 119X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 120X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 121X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 122X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 123 124X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 125X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 126X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 127X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 128 129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 131 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 132 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133 134X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 135X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 136X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 137X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 138 139// state[2] 0x000000 Byte 2 of 3 140X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 141X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 142X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 143X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 144 145X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 146X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 147X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 148X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 149 150X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 151X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 152 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 154 155X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 156X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 157X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 158X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 159 160// state[3] 0x001000 Byte 2 of 3 161X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 162X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 163X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 164X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 165 166X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 167X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 168X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 169X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 170 171 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 172 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 173 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 174 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 175 176X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 177X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 178X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 179X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 180 181// state[4] 0x000000 Byte 2 of 4 182X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 183X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 184X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 185X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 186 187X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 188X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 189X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 190X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 191 192X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 193 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 194 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 195 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 196 197X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 198X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 199X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 200X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 201 202// state[5] 0x040000 Byte 2 of 4 203X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 204X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 205X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 206X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 207 208X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 209X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 210X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 211X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 212 213 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 214 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 215 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 216 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 217 218X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 219X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 220X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 221X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 222 223// state[6] 0x100000 Byte 2 of 4 224X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 225X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 226X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 227X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 228 229X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 230X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 231X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 232X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 233 234 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 235X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 236X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 237X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 238 239X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 240X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 241X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 242X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 243 244// state[7] 0x00d000 Byte 2 of 3 245X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 246X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 247X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 248X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 249 250X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 251X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 252X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 253X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 254 255 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 256 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 257 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 258 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 259 260X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 261X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 262X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 263X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 264 265// state[8] 0x00d800 Byte 3 of 3 266X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 267X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 268X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 269X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 270 271X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 272X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 273X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 274X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 275 276RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 277RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 278RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 279RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 280 281X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 282X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 283X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 284X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 285}; 286 287// Remap base[0] = (del, add, string_offset) 288static const RemapEntry utf8acceptnonsurrogates_remap_base[] = { 289{0, 0, 0} }; 290 291// Remap string[0] 292static const unsigned char utf8acceptnonsurrogates_remap_string[] = { 2930 }; 294 295static const unsigned char utf8acceptnonsurrogates_fast[256] = { 2960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2970, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2980, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2990, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 300 3010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3020, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3030, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3040, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 305 3061, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3071, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3081, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3091, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 310 3111, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3121, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3131, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3141, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 315}; 316 317static const UTF8ScanObj utf8acceptnonsurrogates_obj = { 318 utf8acceptnonsurrogates_STATE0, 319 utf8acceptnonsurrogates_STATE0_SIZE, 320 utf8acceptnonsurrogates_TOTAL_SIZE, 321 utf8acceptnonsurrogates_MAX_EXPAND_X4, 322 utf8acceptnonsurrogates_SHIFT, 323 utf8acceptnonsurrogates_BYTES, 324 utf8acceptnonsurrogates_LOSUB, 325 utf8acceptnonsurrogates_HIADD, 326 utf8acceptnonsurrogates, 327 utf8acceptnonsurrogates_remap_base, 328 utf8acceptnonsurrogates_remap_string, 329 utf8acceptnonsurrogates_fast 330}; 331 332 333#undef X__ 334#undef RJ_ 335#undef S1_ 336#undef S2_ 337#undef S3_ 338#undef S21 339#undef S31 340#undef S32 341#undef T1_ 342#undef T2_ 343#undef S11 344#undef SP_ 345#undef D__ 346#undef RJA 347 348// Return true if current Tbl pointer is within state0 range 349// Note that unsigned compare checks both ends of range simultaneously 350static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { 351 const uint8* Tbl0 = &st->state_table[st->state0]; 352 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); 353} 354 355// Scan a UTF-8 string based on state table. 356// Always scan complete UTF-8 characters 357// Set number of bytes scanned. Return reason for exiting 358int UTF8GenericScan(const UTF8ScanObj* st, 359 const char * str, 360 int str_length, 361 int* bytes_consumed) { 362 *bytes_consumed = 0; 363 if (str_length == 0) return kExitOK; 364 365 int eshift = st->entry_shift; 366 const uint8* isrc = reinterpret_cast<const uint8*>(str); 367 const uint8* src = isrc; 368 const uint8* srclimit = isrc + str_length; 369 const uint8* srclimit8 = srclimit - 7; 370 const uint8* Tbl_0 = &st->state_table[st->state0]; 371 372 DoAgain: 373 // Do state-table scan 374 int e = 0; 375 uint8 c; 376 const uint8* Tbl2 = &st->fast_state[0]; 377 const uint32 losub = st->losub; 378 const uint32 hiadd = st->hiadd; 379 // Check initial few bytes one at a time until 8-byte aligned 380 //---------------------------- 381 while ((((uintptr_t)src & 0x07) != 0) && 382 (src < srclimit) && 383 Tbl2[src[0]] == 0) { 384 src++; 385 } 386 if (((uintptr_t)src & 0x07) == 0) { 387 // Do fast for groups of 8 identity bytes. 388 // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop, 389 // including slowing slightly on cr/lf/ht 390 //---------------------------- 391 while (src < srclimit8) { 392 uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; 393 uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; 394 src += 8; 395 // This is a fast range check for all bytes in [lowsub..0x80-hiadd) 396 uint32 temp = (s0123 - losub) | (s0123 + hiadd) | 397 (s4567 - losub) | (s4567 + hiadd); 398 if ((temp & 0x80808080) != 0) { 399 // We typically end up here on cr/lf/ht; src was incremented 400 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | 401 (Tbl2[src[-6]] | Tbl2[src[-5]]); 402 if (e0123 != 0) { 403 src -= 8; 404 break; 405 } // Exit on Non-interchange 406 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | 407 (Tbl2[src[-2]] | Tbl2[src[-1]]); 408 if (e0123 != 0) { 409 src -= 4; 410 break; 411 } // Exit on Non-interchange 412 // Else OK, go around again 413 } 414 } 415 } 416 //---------------------------- 417 418 // Byte-at-a-time scan 419 //---------------------------- 420 const uint8* Tbl = Tbl_0; 421 while (src < srclimit) { 422 c = *src; 423 e = Tbl[c]; 424 src++; 425 if (e >= kExitIllegalStructure) {break;} 426 Tbl = &Tbl_0[e << eshift]; 427 } 428 //---------------------------- 429 430 431 // Exit posibilities: 432 // Some exit code, !state0, back up over last char 433 // Some exit code, state0, back up one byte exactly 434 // source consumed, !state0, back up over partial char 435 // source consumed, state0, exit OK 436 // For illegal byte in state0, avoid backup up over PREVIOUS char 437 // For truncated last char, back up to beginning of it 438 439 if (e >= kExitIllegalStructure) { 440 // Back up over exactly one byte of rejected/illegal UTF-8 character 441 src--; 442 // Back up more if needed 443 if (!InStateZero(st, Tbl)) { 444 do { 445 src--; 446 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 447 } 448 } else if (!InStateZero(st, Tbl)) { 449 // Back up over truncated UTF-8 character 450 e = kExitIllegalStructure; 451 do { 452 src--; 453 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 454 } else { 455 // Normal termination, source fully consumed 456 e = kExitOK; 457 } 458 459 if (e == kExitDoAgain) { 460 // Loop back up to the fast scan 461 goto DoAgain; 462 } 463 464 *bytes_consumed = src - isrc; 465 return e; 466} 467 468int UTF8GenericScanFastAscii(const UTF8ScanObj* st, 469 const char * str, 470 int str_length, 471 int* bytes_consumed) { 472 *bytes_consumed = 0; 473 if (str_length == 0) return kExitOK; 474 475 const uint8* isrc = reinterpret_cast<const uint8*>(str); 476 const uint8* src = isrc; 477 const uint8* srclimit = isrc + str_length; 478 const uint8* srclimit8 = srclimit - 7; 479 int n; 480 int rest_consumed; 481 int exit_reason; 482 do { 483 // Check initial few bytes one at a time until 8-byte aligned 484 while ((((uintptr_t)src & 0x07) != 0) && 485 (src < srclimit) && (src[0] < 0x80)) { 486 src++; 487 } 488 if (((uintptr_t)src & 0x07) == 0) { 489 while ((src < srclimit8) && 490 (((reinterpret_cast<const uint32*>(src)[0] | 491 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { 492 src += 8; 493 } 494 } 495 while ((src < srclimit) && (src[0] < 0x80)) { 496 src++; 497 } 498 // Run state table on the rest 499 n = src - isrc; 500 exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed); 501 src += rest_consumed; 502 } while ( exit_reason == kExitDoAgain ); 503 504 *bytes_consumed = src - isrc; 505 return exit_reason; 506} 507 508// Hack: On some compilers the static tables are initialized at startup. 509// We can't use them until they are initialized. However, some Protocol 510// Buffer parsing happens at static init time and may try to validate 511// UTF-8 strings. Since UTF-8 validation is only used for debugging 512// anyway, we simply always return success if initialization hasn't 513// occurred yet. 514namespace { 515 516bool module_initialized_ = false; 517 518struct InitDetector { 519 InitDetector() { 520 module_initialized_ = true; 521 } 522}; 523InitDetector init_detector; 524 525} // namespace 526 527bool IsStructurallyValidUTF8(const char* buf, int len) { 528 if (!module_initialized_) return true; 529 530 int bytes_consumed = 0; 531 UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj, 532 buf, len, &bytes_consumed); 533 return (bytes_consumed == len); 534} 535 536int UTF8SpnStructurallyValid(const StringPiece& str) { 537 if (!module_initialized_) return str.size(); 538 539 int bytes_consumed = 0; 540 UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj, 541 str.data(), str.size(), &bytes_consumed); 542 return bytes_consumed; 543} 544 545// Coerce UTF-8 byte string in src_str to be 546// a structurally-valid equal-length string by selectively 547// overwriting illegal bytes with replace_char (typically blank). 548// replace_char must be legal printable 7-bit Ascii 0x20..0x7e. 549// src_str is read-only. If any overwriting is needed, a modified byte string 550// is created in idst, length isrclen. 551// 552// Returns pointer to output buffer, isrc if no changes were made, 553// or idst if some bytes were changed. 554// 555// Fast case: all is structurally valid and no byte copying is done. 556// 557char* UTF8CoerceToStructurallyValid(const StringPiece& src_str, 558 char* idst, 559 const char replace_char) { 560 const char* isrc = src_str.data(); 561 const int len = src_str.length(); 562 int n = UTF8SpnStructurallyValid(src_str); 563 if (n == len) { // Normal case -- all is cool, return 564 return const_cast<char*>(isrc); 565 } else { // Unusual case -- copy w/o bad bytes 566 const char* src = isrc; 567 const char* srclimit = isrc + len; 568 char* dst = idst; 569 memmove(dst, src, n); // Copy initial good chunk 570 src += n; 571 dst += n; 572 while (src < srclimit) { // src points to bogus byte or is off the end 573 dst[0] = replace_char; // replace one bad byte 574 src++; 575 dst++; 576 StringPiece str2(src, srclimit - src); 577 n = UTF8SpnStructurallyValid(str2); // scan the remainder 578 memmove(dst, src, n); // copy next good chunk 579 src += n; 580 dst += n; 581 } 582 } 583 return idst; 584} 585 586} // namespace internal 587} // namespace protobuf 588} // namespace google 589