1// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
6
7#include "base/basictypes.h"
8
9// Return true if current Tbl pointer is within state0 range
10// Note that unsigned compare checks both ends of range simultaneously
11static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
12  const uint8* Tbl0 = &st->state_table[st->state0];
13  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
14}
15
16
17// Look up property of one UTF-8 character and advance over it
18// Return 0 if input length is zero
19// Return 0 and advance one byte if input is ill-formed
20uint8 UTF8GenericProperty(const UTF8PropObj* st,
21                          const uint8** src,
22                          int* srclen) {
23  if (*srclen <= 0) {
24    return 0;
25  }
26
27  const uint8* lsrc = *src;
28  const uint8* Tbl_0 = &st->state_table[st->state0];
29  const uint8* Tbl = Tbl_0;
30  int e;
31  int eshift = st->entry_shift;
32
33  // Short series of tests faster than switch, optimizes 7-bit ASCII
34  unsigned char c = lsrc[0];
35  if (static_cast<signed char>(c) >= 0) {           // one byte
36    e = Tbl[c];
37    *src += 1;
38    *srclen -= 1;
39  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
40    e = Tbl[c];
41    Tbl = &Tbl_0[e << eshift];
42    e = Tbl[lsrc[1]];
43    *src += 2;
44    *srclen -= 2;
45  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
46    e = Tbl[c];
47    Tbl = &Tbl_0[e << eshift];
48    e = Tbl[lsrc[1]];
49    Tbl = &Tbl_0[e << eshift];
50    e = Tbl[lsrc[2]];
51    *src += 3;
52    *srclen -= 3;
53  }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
54    e = Tbl[c];
55    Tbl = &Tbl_0[e << eshift];
56    e = Tbl[lsrc[1]];
57    Tbl = &Tbl_0[e << eshift];
58    e = Tbl[lsrc[2]];
59    Tbl = &Tbl_0[e << eshift];
60    e = Tbl[lsrc[3]];
61    *src += 4;
62    *srclen -= 4;
63  } else {                                                // Ill-formed
64    e = 0;
65    *src += 1;
66    *srclen -= 1;
67  }
68  return e;
69}
70
71// BigOneByte versions are needed for tables > 240 states, but most
72// won't need the TwoByte versions.
73// Internally, to next-to-last offset is multiplied by 16 and the last
74// offset is relative instead of absolute.
75// Look up property of one UTF-8 character and advance over it
76// Return 0 if input length is zero
77// Return 0 and advance one byte if input is ill-formed
78uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
79                          const uint8** src,
80                          int* srclen) {
81  if (*srclen <= 0) {
82    return 0;
83  }
84
85  const uint8* lsrc = *src;
86  const uint8* Tbl_0 = &st->state_table[st->state0];
87  const uint8* Tbl = Tbl_0;
88  int e;
89  int eshift = st->entry_shift;
90
91  // Short series of tests faster than switch, optimizes 7-bit ASCII
92  unsigned char c = lsrc[0];
93  if (static_cast<signed char>(c) >= 0) {           // one byte
94    e = Tbl[c];
95    *src += 1;
96    *srclen -= 1;
97  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
98    e = Tbl[c];
99    Tbl = &Tbl_0[e << eshift];
100    e = Tbl[lsrc[1]];
101    *src += 2;
102    *srclen -= 2;
103  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
104    e = Tbl[c];
105    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
106    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
107    Tbl = &Tbl[e << eshift];          // Relative +/-
108    e = Tbl[lsrc[2]];
109    *src += 3;
110    *srclen -= 3;
111  }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
112    e = Tbl[c];
113    Tbl = &Tbl_0[e << eshift];
114    e = Tbl[lsrc[1]];
115    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
116    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
117    Tbl = &Tbl[e << eshift];          // Relative +/-
118    e = Tbl[lsrc[3]];
119    *src += 4;
120    *srclen -= 4;
121  } else {                                                // Ill-formed
122    e = 0;
123    *src += 1;
124    *srclen -= 1;
125  }
126  return e;
127}
128
129// Scan a UTF-8 stringpiece based on a state table.
130// Always scan complete UTF-8 characters
131// Set number of bytes scanned. Return reason for exiting
132int UTF8GenericScan(const UTF8ScanObj* st,
133                    const uint8* str,
134                    const int len,
135                    int* bytes_consumed) {
136  int eshift = st->entry_shift;        // 6 (space optimized) or 8
137  // int nEntries = (1 << eshift);       // 64 or 256 entries per state
138
139  const uint8* isrc = str;
140    //reinterpret_cast<const uint8*>(str.data());
141  const uint8* src = isrc;
142  //const int len = str.length();
143  const uint8* srclimit = isrc + len;
144  const uint8* srclimit8 = srclimit - 7;
145  *bytes_consumed = 0;
146  if (len == 0) return kExitOK;
147
148  const uint8* Tbl_0 = &st->state_table[st->state0];
149
150DoAgain:
151  // Do state-table scan
152  int e = 0;
153  uint8 c;
154
155  // Do fast for groups of 8 identity bytes.
156  // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
157  // including slowing slightly on cr/lf/ht
158  //----------------------------
159  const uint8* Tbl2 = &st->fast_state[0];
160  uint32 losub = st->losub;
161  uint32 hiadd = st->hiadd;
162  while (src < srclimit8) {
163    uint32 s0123 = UnalignedLoad32(src);
164    uint32 s4567 = UnalignedLoad32(src + 4);
165    src += 8;
166    // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
167    uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
168                  (s4567 - losub) | (s4567 + hiadd);
169    if ((temp & 0x80808080) != 0) {
170      // We typically end up here on cr/lf/ht; src was incremented
171      int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
172                  (Tbl2[src[-6]] | Tbl2[src[-5]]);
173      if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange
174      e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
175              (Tbl2[src[-2]] | Tbl2[src[-1]]);
176      if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange
177      // Else OK, go around again
178    }
179  }
180  //----------------------------
181
182  // Byte-at-a-time scan
183  //----------------------------
184  const uint8* Tbl = Tbl_0;
185  while (src < srclimit) {
186    c = *src;
187    e = Tbl[c];
188    src++;
189    if (e >= kExitIllegalStructure) {break;}
190    Tbl = &Tbl_0[e << eshift];
191  }
192  //----------------------------
193
194
195  // Exit posibilities:
196  //  Some exit code, !state0, back up over last char
197  //  Some exit code, state0, back up one byte exactly
198  //  source consumed, !state0, back up over partial char
199  //  source consumed, state0, exit OK
200  // For illegal byte in state0, avoid backup up over PREVIOUS char
201  // For truncated last char, back up to beginning of it
202
203  if (e >= kExitIllegalStructure) {
204    // Back up over exactly one byte of rejected/illegal UTF-8 character
205    src--;
206    // Back up more if needed
207    if (!InStateZero(st, Tbl)) {
208      do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
209    }
210  } else if (!InStateZero(st, Tbl)) {
211    // Back up over truncated UTF-8 character
212    e = kExitIllegalStructure;
213    do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
214  } else {
215    // Normal termination, source fully consumed
216    e = kExitOK;
217  }
218
219  if (e == kExitDoAgain) {
220    // Loop back up to the fast scan
221    goto DoAgain;
222  }
223
224  *bytes_consumed = src - isrc;
225  return e;
226}
227