1// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
6#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
7
8#if !defined(CLD_WINDOWS)
9
10#include "util/utf8/utf8statetable.h"
11
12#else
13
14#include "encodings/compact_lang_det/win/cld_basictypes.h"
15
16// These four-byte entries compactly encode how many bytes 0..255 to delete
17// in making a string replacement, how many bytes to add 0..255, and the offset
18// 0..64k-1 of the replacement string in remap_string.
19struct RemapEntry {
20  uint8 delete_bytes;
21  uint8 add_bytes;
22  uint16 bytes_offset;
23};
24
// Exit codes that can come out of a state-table lookup.
//
// kExitDstSpaceFull is only ever generated by executable code. All of the
// other codes get stored in one-byte table entries; to keep them
// distinguishable from next-state entries they form a single contiguous
// range that ends at kExitNone.
// NOTE(review): the earlier wording described those one-byte entries as
// "signed", yet the one-byte state_table in UTF8StateMachineObj is declared
// const uint8* and 240..255 only fit in an unsigned byte -- confirm which
// is intended.
typedef enum {
  kExitDstSpaceFull     = 239,
  kExitIllegalStructure = 240,
  kExitOK               = 241,
  kExitReject           = 242,
  kExitReplace1         = 243,
  kExitReplace2         = 244,
  kExitReplace3         = 245,
  kExitReplace21        = 246,
  kExitReplace31        = 247,
  kExitReplace32        = 248,
  kExitReplaceOffset1   = 249,
  kExitReplaceOffset2   = 250,
  kExitReplace1S0       = 251,
  kExitSpecial          = 252,
  kExitDoAgain          = 253,
  kExitRejectAlt        = 254,
  kExitNone             = 255
} ExitReason;
48
// Counterpart of ExitReason: the same exit codes in the same order, each
// suffixed _2, with the contiguous range starting at -32769 instead of 239.
// NOTE(review): presumably these are the exit codes stored in the two-byte
// (signed short) tables of UTF8StateMachineObj_2; every value except
// kExitDstSpaceFull_2 lies within the signed 16-bit range -- confirm
// against the implementation.
typedef enum {
  kExitDstSpaceFull_2     = -32769,
  kExitIllegalStructure_2 = -32768,
  kExitOK_2               = -32767,
  kExitReject_2           = -32766,
  kExitReplace1_2         = -32765,
  kExitReplace2_2         = -32764,
  kExitReplace3_2         = -32763,
  kExitReplace21_2        = -32762,
  kExitReplace31_2        = -32761,
  kExitReplace32_2        = -32760,
  kExitReplaceOffset1_2   = -32759,
  kExitReplaceOffset2_2   = -32758,
  kExitReplace1S0_2       = -32757,
  kExitSpecial_2          = -32756,
  kExitDoAgain_2          = -32755,
  kExitRejectAlt_2        = -32754,
  kExitNone_2             = -32753
} ExitReason_2;
68
69// This struct represents one entire state table. The three initialized byte
70// areas are state_table, remap_base, and remap_string. state0 and state0_size
71// give the byte offset and length within state_table of the initial state --
72// table lookups are expected to start and end in this state, but for
73// truncated UTF-8 strings, may end in a different state. These allow a quick
74// test for that condition. entry_shift is 8 for tables subscripted by a full
75// byte value and 6 for space-optimized tables subscripted by only six
76// significant bits in UTF-8 continuation bytes.
77typedef struct {
78  const uint32 state0;
79  const uint32 state0_size;
80  const uint32 total_size;
81  const int max_expand;
82  const int entry_shift;
83  const int bytes_per_entry;
84  const uint32 losub;
85  const uint32 hiadd;
86  const uint8* state_table;
87  const RemapEntry* remap_base;
88  const uint8* remap_string;
89  const uint8* fast_state;
90} UTF8StateMachineObj;
91
92// Near-duplicate declaration for tables with two-byte entries
93typedef struct {
94  const uint32 state0;
95  const uint32 state0_size;
96  const uint32 total_size;
97  const int max_expand;
98  const int entry_shift;
99  const int bytes_per_entry;
100  const uint32 losub;
101  const uint32 hiadd;
102  const signed short* state_table;
103  const RemapEntry* remap_base;
104  const uint8* remap_string;
105  const uint8* fast_state;
106} UTF8StateMachineObj_2;
107
108
109typedef UTF8StateMachineObj UTF8PropObj;
110typedef UTF8StateMachineObj UTF8ScanObj;
111typedef UTF8StateMachineObj_2 UTF8PropObj_2;
112
113
114// Look up property of one UTF-8 character and advance over it
115// Return 0 if input length is zero
116// Return 0 and advance one byte if input is ill-formed
117uint8 UTF8GenericProperty(const UTF8PropObj* st,
118                          const uint8** src,
119                          int* srclen);
120
121// BigOneByte versions are needed for tables > 240 states, but most
122// won't need the TwoByte versions.
123
124// Look up property of one UTF-8 character and advance over it
125// Return 0 if input length is zero
126// Return 0 and advance one byte if input is ill-formed
127uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
128                          const uint8** src,
129                          int* srclen);
130
131// Scan a UTF-8 stringpiece based on a state table.
132// Always scan complete UTF-8 characters
133// Set number of bytes scanned. Return reason for exiting
134int UTF8GenericScan(const UTF8ScanObj* st,
135                    const uint8* str,
136                    const int len,
137                    int* bytes_consumed);
138
139#endif
140
141#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
142