1// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_ 6#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_ 7 8#if !defined(CLD_WINDOWS) 9 10#include "util/utf8/utf8statetable.h" 11 12#else 13 14#include "encodings/compact_lang_det/win/cld_basictypes.h" 15 16// These four-byte entries compactly encode how many bytes 0..255 to delete 17// in making a string replacement, how many bytes to add 0..255, and the offset 18// 0..64k-1 of the replacement string in remap_string. 19struct RemapEntry { 20 uint8 delete_bytes; 21 uint8 add_bytes; 22 uint16 bytes_offset; 23}; 24 25// Exit type codes for state tables. All but the first get stuffed into 26// signed one-byte entries. The first is only generated by executable code. 27// To distinguish from next-state entries, these must be contiguous and 28// all <= kExitNone 29typedef enum { 30 kExitDstSpaceFull = 239, 31 kExitIllegalStructure, // 240 32 kExitOK, // 241 33 kExitReject, // ... 34 kExitReplace1, 35 kExitReplace2, 36 kExitReplace3, 37 kExitReplace21, 38 kExitReplace31, 39 kExitReplace32, 40 kExitReplaceOffset1, 41 kExitReplaceOffset2, 42 kExitReplace1S0, 43 kExitSpecial, 44 kExitDoAgain, 45 kExitRejectAlt, 46 kExitNone // 255 47} ExitReason; 48 49typedef enum { 50 kExitDstSpaceFull_2 = -32769, 51 kExitIllegalStructure_2, // -32768 52 kExitOK_2, // -32767 53 kExitReject_2, // ... 54 kExitReplace1_2, 55 kExitReplace2_2, 56 kExitReplace3_2, 57 kExitReplace21_2, 58 kExitReplace31_2, 59 kExitReplace32_2, 60 kExitReplaceOffset1_2, 61 kExitReplaceOffset2_2, 62 kExitReplace1S0_2, 63 kExitSpecial_2, 64 kExitDoAgain_2, 65 kExitRejectAlt_2, 66 kExitNone_2 // -32753 67} ExitReason_2; 68 69// This struct represents one entire state table. The three initialized byte 70// areas are state_table, remap_base, and remap_string. state0 and state0_size 71// give the byte offset and length within state_table of the initial state -- 72// table lookups are expected to start and end in this state, but for 73// truncated UTF-8 strings, may end in a different state. These allow a quick 74// test for that condition. entry_shift is 8 for tables subscripted by a full 75// byte value and 6 for space-optimized tables subscripted by only six 76// significant bits in UTF-8 continuation bytes. 77typedef struct { 78 const uint32 state0; 79 const uint32 state0_size; 80 const uint32 total_size; 81 const int max_expand; 82 const int entry_shift; 83 const int bytes_per_entry; 84 const uint32 losub; 85 const uint32 hiadd; 86 const uint8* state_table; 87 const RemapEntry* remap_base; 88 const uint8* remap_string; 89 const uint8* fast_state; 90} UTF8StateMachineObj; 91 92// Near-duplicate declaration for tables with two-byte entries 93typedef struct { 94 const uint32 state0; 95 const uint32 state0_size; 96 const uint32 total_size; 97 const int max_expand; 98 const int entry_shift; 99 const int bytes_per_entry; 100 const uint32 losub; 101 const uint32 hiadd; 102 const signed short* state_table; 103 const RemapEntry* remap_base; 104 const uint8* remap_string; 105 const uint8* fast_state; 106} UTF8StateMachineObj_2; 107 108 109typedef UTF8StateMachineObj UTF8PropObj; 110typedef UTF8StateMachineObj UTF8ScanObj; 111typedef UTF8StateMachineObj_2 UTF8PropObj_2; 112 113 114// Look up property of one UTF-8 character and advance over it 115// Return 0 if input length is zero 116// Return 0 and advance one byte if input is ill-formed 117uint8 UTF8GenericProperty(const UTF8PropObj* st, 118 const uint8** src, 119 int* srclen); 120 121// BigOneByte versions are needed for tables > 240 states, but most 122// won't need the TwoByte versions. 123 124// Look up property of one UTF-8 character and advance over it 125// Return 0 if input length is zero 126// Return 0 and advance one byte if input is ill-formed 127uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, 128 const uint8** src, 129 int* srclen); 130 131// Scan a UTF-8 stringpiece based on a state table. 132// Always scan complete UTF-8 characters 133// Set number of bytes scanned. Return reason for exiting 134int UTF8GenericScan(const UTF8ScanObj* st, 135 const uint8* str, 136 const int len, 137 int* bytes_consumed); 138 139#endif 140 141#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_ 142