1fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Copyright 2005-2008 Google Inc. All Rights Reserved. 2fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Author: jrm@google.com (Jim Meehan) 3fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 4fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#include <google/protobuf/stubs/common.h> 5fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 6b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer#include <google/protobuf/stubs/stringpiece.h> 7b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer 8fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace google { 9fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace protobuf { 10fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace internal { 11fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 12fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// These four-byte entries compactly encode how many bytes 0..255 to delete 13fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// in making a string replacement, how many bytes to add 0..255, and the offset 14fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// 0..64k-1 of the replacement string in remap_string. 15fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestruct RemapEntry { 16fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville uint8 delete_bytes; 17fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville uint8 add_bytes; 18fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville uint16 bytes_offset; 19fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}; 20fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 21fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Exit type codes for state tables. All but the first get stuffed into 22fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// signed one-byte entries. The first is only generated by executable code. 23fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// To distinguish from next-state entries, these must be contiguous and 24fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// all <= kExitNone 25fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilletypedef enum { 26fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitDstSpaceFull = 239, 27fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitIllegalStructure, // 240 28fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitOK, // 241 29fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReject, // ... 30fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReplace1, 31fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReplace2, 32fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReplace3, 33fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReplace21, 34fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReplace31, 35fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReplace32, 36fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReplaceOffset1, 37fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReplaceOffset2, 38fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitReplace1S0, 39fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitSpecial, 40fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitDoAgain, 41fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitRejectAlt, 42fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville kExitNone // 255 43fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} ExitReason; 44fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 45fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 46fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// This struct represents one entire state table. The three initialized byte 47fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// areas are state_table, remap_base, and remap_string. state0 and state0_size 48fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// give the byte offset and length within state_table of the initial state -- 49fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// table lookups are expected to start and end in this state, but for 50fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// truncated UTF-8 strings, may end in a different state. These allow a quick 51fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// test for that condition. entry_shift is 8 for tables subscripted by a full 52fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// byte value and 6 for space-optimized tables subscripted by only six 53fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// significant bits in UTF-8 continuation bytes. 54fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilletypedef struct { 55fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint32 state0; 56fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint32 state0_size; 57fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint32 total_size; 58fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const int max_expand; 59fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const int entry_shift; 60fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const int bytes_per_entry; 61fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint32 losub; 62fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint32 hiadd; 63fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* state_table; 64fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const RemapEntry* remap_base; 65fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* remap_string; 66fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* fast_state; 67fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} UTF8StateMachineObj; 68fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 69fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilletypedef UTF8StateMachineObj UTF8ScanObj; 70fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 71fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define X__ (kExitIllegalStructure) 72fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define RJ_ (kExitReject) 73fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define S1_ (kExitReplace1) 74fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define S2_ (kExitReplace2) 75fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define S3_ (kExitReplace3) 76fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define S21 (kExitReplace21) 77fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define S31 (kExitReplace31) 78fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define S32 (kExitReplace32) 79fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define T1_ (kExitReplaceOffset1) 80fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define T2_ (kExitReplaceOffset2) 81fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define S11 (kExitReplace1S0) 82fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define SP_ (kExitSpecial) 83fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define D__ (kExitDoAgain) 84fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define RJA (kExitRejectAlt) 85fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 86fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Entire table has 9 state blocks of 256 entries each 87fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned int utf8acceptnonsurrogates_STATE0 = 0; // state[0] 88fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256; // =[1] 89fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304; 90fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0; 91fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned int utf8acceptnonsurrogates_SHIFT = 8; 92fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned int utf8acceptnonsurrogates_BYTES = 1; 93fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020; 94fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000; 95fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 96fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const uint8 utf8acceptnonsurrogates[] = { 97fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[0] 0x000000 Byte 1 98fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 99fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 101fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 103fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 105fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 106fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 107fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 108fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 109fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 110fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 111fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 112fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 113fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 114fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 115fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3, 116fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 117fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 118fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[1] 0x000080 Byte 2 of 2 119fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 120fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 121fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 122fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 123fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 124fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 125fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 126fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 127fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 128fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 129fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 131fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 132fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 134fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 135fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 136fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 137fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 138fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 139fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[2] 0x000000 Byte 2 of 3 140fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 141fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 142fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 143fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 144fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 145fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 146fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 147fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 148fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 149fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 150fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 151fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 152fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 153fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 154fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 155fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 156fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 157fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 158fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 159fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 160fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[3] 0x001000 Byte 2 of 3 161fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 162fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 163fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 164fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 165fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 166fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 167fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 168fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 169fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 170fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 171fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 172fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 173fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 174fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 175fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 176fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 177fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 178fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 179fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 180fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 181fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[4] 0x000000 Byte 2 of 4 182fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 183fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 184fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 185fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 186fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 187fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 188fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 189fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 190fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 191fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 192fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 193fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 194fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 195fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 196fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 197fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 198fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 199fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 200fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 201fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 202fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[5] 0x040000 Byte 2 of 4 203fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 204fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 205fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 206fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 207fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 208fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 209fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 210fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 211fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 212fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 213fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 214fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 215fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 216fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 217fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 218fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 219fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 220fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 221fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 222fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 223fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[6] 0x100000 Byte 2 of 4 224fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 225fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 226fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 227fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 228fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 229fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 230fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 231fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 232fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 233fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 234fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 235fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 236fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 237fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 238fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 239fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 240fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 241fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 242fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 243fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 244fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[7] 0x00d000 Byte 2 of 3 245fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 246fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 247fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 248fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 249fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 250fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 251fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 252fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 253fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 254fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 255fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 256fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 257fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 258fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 259fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 260fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 261fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 262fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 263fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 264fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 265fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// state[8] 0x00d800 Byte 3 of 3 266fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 267fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 268fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 269fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 270fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 271fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 272fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 273fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 274fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 275fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 276fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleRJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 277fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleRJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 278fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleRJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 279fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleRJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, 280fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 281fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 282fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 283fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 284fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleX__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, 285fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}; 286fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 287fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Remap base[0] = (del, add, string_offset) 288fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const RemapEntry utf8acceptnonsurrogates_remap_base[] = { 289fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville{0, 0, 0} }; 290fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 291fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Remap string[0] 292fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned char utf8acceptnonsurrogates_remap_string[] = { 293fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville0 }; 294fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 295fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const unsigned char utf8acceptnonsurrogates_fast[256] = { 296fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 297fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 298fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 299fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 300fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 301fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 302fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 303fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 304fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 305fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 306fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 307fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 308fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 309fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 310fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 311fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 312fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 313fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 314fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 315fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}; 316fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 317fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic const UTF8ScanObj utf8acceptnonsurrogates_obj = { 318fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_STATE0, 319fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_STATE0_SIZE, 320fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_TOTAL_SIZE, 321fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_MAX_EXPAND_X4, 322fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_SHIFT, 323fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_BYTES, 324fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_LOSUB, 325fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_HIADD, 326fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates, 327fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_remap_base, 328fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_remap_string, 329fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville utf8acceptnonsurrogates_fast 330fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}; 331fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 332fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 333fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef X__ 334fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef RJ_ 335fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S1_ 336fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S2_ 337fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S3_ 338fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S21 339fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S31 340fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S32 341fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef T1_ 342fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef T2_ 343fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef S11 344fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef SP_ 345fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef D__ 346fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#undef RJA 347fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 348fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Return true if current Tbl pointer is within state0 range 349fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Note that unsigned compare checks both ends of range simultaneously 350fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestatic inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { 351fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* Tbl0 = &st->state_table[st->state0]; 352fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); 353fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} 354fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 355fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Scan a UTF-8 string based on state table. 356fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Always scan complete UTF-8 characters 357fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Set number of bytes scanned. Return reason for exiting 358fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleint UTF8GenericScan(const UTF8ScanObj* st, 359fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const char * str, 360fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int str_length, 361fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int* bytes_consumed) { 362fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville *bytes_consumed = 0; 363fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville if (str_length == 0) return kExitOK; 364fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 365fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int eshift = st->entry_shift; 366fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* isrc = reinterpret_cast<const uint8*>(str); 367fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* src = isrc; 368fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* srclimit = isrc + str_length; 369fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* srclimit8 = srclimit - 7; 370fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* Tbl_0 = &st->state_table[st->state0]; 371fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 372fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville DoAgain: 373fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Do state-table scan 374fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int e = 0; 375fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville uint8 c; 376fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* Tbl2 = &st->fast_state[0]; 377d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville const uint32 losub = st->losub; 378d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville const uint32 hiadd = st->hiadd; 379d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville // Check initial few bytes one at a time until 8-byte aligned 380d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville //---------------------------- 381d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville while ((((uintptr_t)src & 0x07) != 0) && 382d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville (src < srclimit) && 383d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville Tbl2[src[0]] == 0) { 384d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville src++; 385d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville } 386d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville if (((uintptr_t)src & 0x07) == 0) { 387d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville // Do fast for groups of 8 identity bytes. 388d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop, 389d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville // including slowing slightly on cr/lf/ht 390d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville //---------------------------- 391d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville while (src < srclimit8) { 392d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; 393d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; 394d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville src += 8; 395d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville // This is a fast range check for all bytes in [lowsub..0x80-hiadd) 396d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville uint32 temp = (s0123 - losub) | (s0123 + hiadd) | 397d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville (s4567 - losub) | (s4567 + hiadd); 398d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville if ((temp & 0x80808080) != 0) { 399d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville // We typically end up here on cr/lf/ht; src was incremented 400d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | 401d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville (Tbl2[src[-6]] | Tbl2[src[-5]]); 402d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville if (e0123 != 0) { 403d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville src -= 8; 404d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville break; 405d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville } // Exit on Non-interchange 406d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | 407d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville (Tbl2[src[-2]] | Tbl2[src[-1]]); 408d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville if (e0123 != 0) { 409d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville src -= 4; 410d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville break; 411d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville } // Exit on Non-interchange 412d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville // Else OK, go around again 413d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville } 414fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } 415fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } 416fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville //---------------------------- 417fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 418fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Byte-at-a-time scan 419fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville //---------------------------- 420fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* Tbl = Tbl_0; 421fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville while (src < srclimit) { 422fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville c = *src; 423fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville e = Tbl[c]; 424fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville src++; 425fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville if (e >= kExitIllegalStructure) {break;} 426fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville Tbl = &Tbl_0[e << eshift]; 427fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } 428fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville //---------------------------- 429fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 430fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 431fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Exit posibilities: 432fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Some exit code, !state0, back up over last char 433fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Some exit code, state0, back up one byte exactly 434fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // source consumed, !state0, back up over partial char 435fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // source consumed, state0, exit OK 436fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // For illegal byte in state0, avoid backup up over PREVIOUS char 437fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // For truncated last char, back up to beginning of it 438fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 439fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville if (e >= kExitIllegalStructure) { 440fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Back up over exactly one byte of rejected/illegal UTF-8 character 441fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville src--; 442fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Back up more if needed 443fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville if (!InStateZero(st, Tbl)) { 444fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville do { 445fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville src--; 446fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 447fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } 448fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } else if (!InStateZero(st, Tbl)) { 449fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Back up over truncated UTF-8 character 450fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville e = kExitIllegalStructure; 451fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville do { 452fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville src--; 453fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 454fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } else { 455fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Normal termination, source fully consumed 456fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville e = kExitOK; 457fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } 458fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 459fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville if (e == kExitDoAgain) { 460fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Loop back up to the fast scan 461fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville goto DoAgain; 462fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } 463fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 464fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville *bytes_consumed = src - isrc; 465fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville return e; 466fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} 467fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 468fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleint UTF8GenericScanFastAscii(const UTF8ScanObj* st, 469fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const char * str, 470fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int str_length, 471fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int* bytes_consumed) { 472fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville *bytes_consumed = 0; 473fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville if (str_length == 0) return kExitOK; 474fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 475fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* isrc = reinterpret_cast<const uint8*>(str); 476fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* src = isrc; 477fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* srclimit = isrc + str_length; 478fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville const uint8* srclimit8 = srclimit - 7; 479fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int n; 480fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int rest_consumed; 481fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int exit_reason; 482fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville do { 483d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville // Check initial few bytes one at a time until 8-byte aligned 484d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville while ((((uintptr_t)src & 0x07) != 0) && 485d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville (src < srclimit) && (src[0] < 0x80)) { 486d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville src++; 487d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville } 488d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville if (((uintptr_t)src & 0x07) == 0) { 489d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville while ((src < srclimit8) && 490d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville (((reinterpret_cast<const uint32*>(src)[0] | 491d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { 492d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville src += 8; 493d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville } 494fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } 495fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville while ((src < srclimit) && (src[0] < 0x80)) { 496fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville src++; 497fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } 498fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville // Run state table on the rest 499fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville n = src - isrc; 500fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed); 501fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville src += rest_consumed; 502fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } while ( exit_reason == kExitDoAgain ); 503fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 504fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville *bytes_consumed = src - isrc; 505fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville return exit_reason; 506fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} 507fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 508fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Hack: On some compilers the static tables are initialized at startup. 509fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// We can't use them until they are initialized. However, some Protocol 510fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Buffer parsing happens at static init time and may try to validate 511fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// UTF-8 strings. Since UTF-8 validation is only used for debugging 512fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// anyway, we simply always return success if initialization hasn't 513fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// occurred yet. 514fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace { 515fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 516fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillebool module_initialized_ = false; 517fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 518fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillestruct InitDetector { 519fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville InitDetector() { 520fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville module_initialized_ = true; 521fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville } 522fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}; 523fbaaef999ba563838ebd00874ed8a1c01fbf286dWink SavilleInitDetector init_detector; 524fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 525fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} // namespace 526fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 527fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillebool IsStructurallyValidUTF8(const char* buf, int len) { 528fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville if (!module_initialized_) return true; 529fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 530fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville int bytes_consumed = 0; 531fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj, 532fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville buf, len, &bytes_consumed); 533fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville return (bytes_consumed == len); 534fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} 535fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville 536b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammerint UTF8SpnStructurallyValid(const StringPiece& str) { 537b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer if (!module_initialized_) return str.size(); 538b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer 539b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer int bytes_consumed = 0; 540b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj, 541b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer str.data(), str.size(), &bytes_consumed); 542b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer return bytes_consumed; 543b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer} 544b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer 545b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// Coerce UTF-8 byte string in src_str to be 546b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// a structurally-valid equal-length string by selectively 547b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// overwriting illegal bytes with replace_char (typically blank). 548b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// replace_char must be legal printable 7-bit Ascii 0x20..0x7e. 549b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// src_str is read-only. If any overwriting is needed, a modified byte string 550b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// is created in idst, length isrclen. 551b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// 552b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// Returns pointer to output buffer, isrc if no changes were made, 553b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// or idst if some bytes were changed. 554b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// 555b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// Fast case: all is structurally valid and no byte copying is done. 556b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer// 557b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammerchar* UTF8CoerceToStructurallyValid(const StringPiece& src_str, 558b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer char* idst, 559b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer const char replace_char) { 560b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer const char* isrc = src_str.data(); 561b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer const int len = src_str.length(); 562b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer int n = UTF8SpnStructurallyValid(src_str); 563b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer if (n == len) { // Normal case -- all is cool, return 564b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer return const_cast<char*>(isrc); 565b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer } else { // Unusual case -- copy w/o bad bytes 566b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer const char* src = isrc; 567b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer const char* srclimit = isrc + len; 568b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer char* dst = idst; 569b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer memmove(dst, src, n); // Copy initial good chunk 570b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer src += n; 571b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer dst += n; 572b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer while (src < srclimit) { // src points to bogus byte or is off the end 573b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer dst[0] = replace_char; // replace one bad byte 574b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer src++; 575b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer dst++; 576b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer StringPiece str2(src, srclimit - src); 577b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer n = UTF8SpnStructurallyValid(str2); // scan the remainder 578b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer memmove(dst, src, n); // copy next good chunk 579b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer src += n; 580b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer dst += n; 581b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer } 582b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer } 583b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer return idst; 584b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer} 585b0575e93e4c39dec69365b850088a1eb7f82c5b3Tamas Berghammer 586fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} // namespace internal 587fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} // namespace protobuf 588fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville} // namespace google 589