build_utf8_validator_tables.cc revision 5d1f7b1de12d16ceb2c938c56701a3e8bfa558f7
1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Create a state machine for validating UTF-8. The algorithm in brief:
6// 1. Convert the complete unicode range of code points, except for the
7//    surrogate code points, to an ordered array of sequences of bytes in
8//    UTF-8.
9// 2. Convert individual bytes to ranges, starting from the right of each byte
10//    sequence. For each range, ensure the bytes on the left and the ranges
11//    on the right are the identical.
12// 3. Convert the resulting list of ranges into a state machine, collapsing
13//    identical states.
14// 4. Convert the state machine to an array of bytes.
15// 5. Output as a C++ file.
16//
17// To use:
18//  $ ninja -C out/Release build_utf8_validator_tables
19//  $ out/Release/build_utf8_validator_tables
20//                                   --output=base/i18n/utf8_validator_tables.cc
21//  $ git add base/i18n/utf8_validator_tables.cc
22//
23// Because the table is not expected to ever change, it is checked into the
24// repository rather than being regenerated at build time.
25//
26// This code uses type uint8 throughout to represent bytes, to avoid
27// signed/unsigned char confusion.
28
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33#include <algorithm>
34#include <map>
35#include <string>
36#include <vector>
37
38#include "base/basictypes.h"
39#include "base/command_line.h"
40#include "base/file_util.h"
41#include "base/files/file_path.h"
42#include "base/logging.h"
43#include "base/numerics/safe_conversions.h"
44#include "base/strings/stringprintf.h"
45#include "third_party/icu/source/common/unicode/utf8.h"
46
47namespace {
48
49const char kHelpText[] =
50    "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";
51
52const char kProlog[] =
53    "// Copyright 2013 The Chromium Authors. All rights reserved.\n"
54    "// Use of this source code is governed by a BSD-style license that can "
55    "be\n"
56    "// found in the LICENSE file.\n"
57    "\n"
58    "// This file is auto-generated by build_utf8_validator_tables.\n"
59    "// DO NOT EDIT.\n"
60    "\n"
61    "#include \"base/i18n/utf8_validator_tables.h\"\n"
62    "\n"
63    "namespace base {\n"
64    "namespace internal {\n"
65    "\n"
66    "const uint8 kUtf8ValidatorTables[] = {\n";
67
68const char kEpilog[] =
69    "};\n"
70    "\n"
71    "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n"
72    "\n"
73    "}  // namespace internal\n"
74    "}  // namespace base\n";
75
76// Ranges are inclusive at both ends--they represent [from, to]
77class Range {
78 public:
79  // Ranges always start with just one byte.
80  explicit Range(uint8 value) : from_(value), to_(value) {}
81
82  // Range objects are copyable and assignable to be used in STL
83  // containers. Since they only contain non-pointer POD types, the default copy
84  // constructor, assignment operator and destructor will work.
85
86  // Add a byte to the range. We intentionally only support adding a byte at the
87  // end, since that is the only operation the code needs.
88  void AddByte(uint8 to) {
89    CHECK(to == to_ + 1);
90    to_ = to;
91  }
92
93  uint8 from() const { return from_; }
94  uint8 to() const { return to_; }
95
96  bool operator<(const Range& rhs) const {
97    return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));
98  }
99
100  bool operator==(const Range& rhs) const {
101    return from() == rhs.from() && to() == rhs.to();
102  }
103
104 private:
105  uint8 from_;
106  uint8 to_;
107};
108
109// A vector of Ranges is like a simple regular expression--it corresponds to
110// a set of strings of the same length that have bytes in each position in
111// the appropriate range.
112typedef std::vector<Range> StringSet;
113
114// A UTF-8 "character" is represented by a sequence of bytes.
115typedef std::vector<uint8> Character;
116
117// In the second stage of the algorithm, we want to convert a large list of
118// Characters into a small list of StringSets.
119struct Pair {
120  Character character;
121  StringSet set;
122};
123
124typedef std::vector<Pair> PairVector;
125
126// A class to print a table of numbers in the same style as clang-format.
127class TablePrinter {
128 public:
129  explicit TablePrinter(FILE* stream)
130      : stream_(stream), values_on_this_line_(0), current_offset_(0) {}
131
132  void PrintValue(uint8 value) {
133    if (values_on_this_line_ == 0) {
134      fputs("   ", stream_);
135    } else if (values_on_this_line_ == kMaxValuesPerLine) {
136      fprintf(stream_, "  // 0x%02x\n   ", current_offset_);
137      values_on_this_line_ = 0;
138    }
139    fprintf(stream_, " 0x%02x,", static_cast<int>(value));
140    ++values_on_this_line_;
141    ++current_offset_;
142  }
143
144  void NewLine() {
145    while (values_on_this_line_ < kMaxValuesPerLine) {
146      fputs("      ", stream_);
147      ++values_on_this_line_;
148    }
149    fprintf(stream_, "  // 0x%02x\n", current_offset_);
150    values_on_this_line_ = 0;
151  }
152
153 private:
154  // stdio stream. Not owned.
155  FILE* stream_;
156
157  // Number of values so far printed on this line.
158  int values_on_this_line_;
159
160  // Total values printed so far.
161  int current_offset_;
162
163  static const int kMaxValuesPerLine = 8;
164
165  DISALLOW_COPY_AND_ASSIGN(TablePrinter);
166};
167
168// Start by filling a PairVector with characters. The resulting vector goes from
169// "\x00" to "\xf4\x8f\xbf\xbf".
170PairVector InitializeCharacters() {
171  PairVector vector;
172  for (int i = 0; i <= 0x10FFFF; ++i) {
173    if (i >= 0xD800 && i < 0xE000) {
174      // Surrogate codepoints are not permitted. Non-character code points are
175      // explicitly permitted.
176      continue;
177    }
178    uint8 bytes[4];
179    unsigned int offset = 0;
180    UBool is_error = false;
181    U8_APPEND(bytes, offset, arraysize(bytes), i, is_error);
182    DCHECK(!is_error);
183    DCHECK_GT(offset, 0u);
184    DCHECK_LE(offset, arraysize(bytes));
185    Pair pair = {Character(bytes, bytes + offset), StringSet()};
186    vector.push_back(pair);
187  }
188  return vector;
189}
190
191// Construct a new Pair from |character| and the concatenation of |new_range|
192// and |existing_set|, and append it to |pairs|.
193void ConstructPairAndAppend(const Character& character,
194                            const Range& new_range,
195                            const StringSet& existing_set,
196                            PairVector* pairs) {
197  Pair new_pair = {character, StringSet(1, new_range)};
198  new_pair.set.insert(
199      new_pair.set.end(), existing_set.begin(), existing_set.end());
200  pairs->push_back(new_pair);
201}
202
203// Each pass over the PairVector strips one byte off the right-hand-side of the
204// characters and adds a range to the set on the right. For example, the first
205// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
206// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
207// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
208// [\xa0-\xbf][\x80-\xbf]).
209void MoveRightMostCharToSet(PairVector* pairs) {
210  PairVector new_pairs;
211  PairVector::const_iterator it = pairs->begin();
212  while (it != pairs->end() && it->character.empty()) {
213    new_pairs.push_back(*it);
214    ++it;
215  }
216  CHECK(it != pairs->end());
217  Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
218  Range new_range(it->character.back());
219  StringSet converted = it->set;
220  ++it;
221  while (it != pairs->end()) {
222    const Pair& current_pair = *it++;
223    if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
224        std::equal(unconverted_bytes.begin(),
225                   unconverted_bytes.end(),
226                   current_pair.character.begin()) &&
227        converted == current_pair.set) {
228      // The particular set of UTF-8 codepoints we are validating guarantees
229      // that each byte range will be contiguous. This would not necessarily be
230      // true for an arbitrary set of UTF-8 codepoints.
231      DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
232      new_range.AddByte(current_pair.character.back());
233      continue;
234    }
235    ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
236    unconverted_bytes = Character(current_pair.character.begin(),
237                                  current_pair.character.end() - 1);
238    new_range = Range(current_pair.character.back());
239    converted = current_pair.set;
240  }
241  ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
242  new_pairs.swap(*pairs);
243}
244
245void MoveAllCharsToSets(PairVector* pairs) {
246  // Since each pass of the function moves one character, and UTF-8 sequences
247  // are at most 4 characters long, this simply runs the algorithm four times.
248  for (int i = 0; i < 4; ++i) {
249    MoveRightMostCharToSet(pairs);
250  }
251  if (DCHECK_IS_ON()) {
252    for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
253         ++it) {
254      DCHECK(it->character.empty());
255    }
256  }
257}
258
259// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
260// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
261// algorithm is working. Use the command-line option
262// --vmodule=build_utf8_validator_tables=1 to see this output.
263void LogStringSets(const PairVector& pairs) {
264  for (PairVector::const_iterator pair_it = pairs.begin();
265       pair_it != pairs.end();
266       ++pair_it) {
267    std::string set_as_string;
268    for (StringSet::const_iterator set_it = pair_it->set.begin();
269         set_it != pair_it->set.end();
270         ++set_it) {
271      set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
272                                          static_cast<int>(set_it->from()),
273                                          static_cast<int>(set_it->to()));
274    }
275    VLOG(1) << set_as_string;
276  }
277}
278
279// A single state in the state machine is represented by a sorted vector of
280// start bytes and target states. All input bytes in the range between the start
281// byte and the next entry in the vector (or 0xFF) result in a transition to the
282// target state.
283struct StateRange {
284  uint8 from;
285  uint8 target_state;
286};
287
288typedef std::vector<StateRange> State;
289
290// Generates a state where all bytes go to state 1 (invalid). This is also used
291// as an initialiser for other states (since bytes from outside the desired
292// range are invalid).
293State GenerateInvalidState() {
294  const StateRange range = {0, 1};
295  return State(1, range);
296}
297
298// A map from a state (ie. a set of strings which will match from this state) to
299// a number (which is an index into the array of states).
300typedef std::map<StringSet, uint8> StateMap;
301
302// Create a new state corresponding to |set|, add it |states| and |state_map|
303// and return the index it was given in |states|.
304uint8 MakeState(const StringSet& set,
305                std::vector<State>* states,
306                StateMap* state_map) {
307  DCHECK(!set.empty());
308  const Range& range = set.front();
309  const StringSet rest(set.begin() + 1, set.end());
310  const StateMap::const_iterator where = state_map->find(rest);
311  const uint8 target_state = where == state_map->end()
312                                 ? MakeState(rest, states, state_map)
313                                 : where->second;
314  DCHECK_LT(0, range.from());
315  DCHECK_LT(range.to(), 0xFF);
316  const StateRange new_state_initializer[] = {
317      {0, 1}, {range.from(), target_state}, {range.to() + 1, 1}};
318  states->push_back(
319      State(new_state_initializer,
320            new_state_initializer + arraysize(new_state_initializer)));
321  const uint8 new_state_number =
322      base::checked_cast<uint8>(states->size() - 1);
323  CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
324  return new_state_number;
325}
326
327std::vector<State> GenerateStates(const PairVector& pairs) {
328  // States 0 and 1 are the initial/valid state and invalid state, respectively.
329  std::vector<State> states(2, GenerateInvalidState());
330  StateMap state_map;
331  state_map.insert(std::make_pair(StringSet(), 0));
332  for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) {
333    DCHECK(it->character.empty());
334    DCHECK(!it->set.empty());
335    const Range& range = it->set.front();
336    const StringSet rest(it->set.begin() + 1, it->set.end());
337    const StateMap::const_iterator where = state_map.find(rest);
338    const uint8 target_state = where == state_map.end()
339                                   ? MakeState(rest, &states, &state_map)
340                                   : where->second;
341    if (states[0].back().from == range.from()) {
342      DCHECK_EQ(1, states[0].back().target_state);
343      states[0].back().target_state = target_state;
344      DCHECK_LT(range.to(), 0xFF);
345      const StateRange new_range = {range.to() + 1, 1};
346      states[0].push_back(new_range);
347    } else {
348      DCHECK_LT(range.to(), 0xFF);
349      const StateRange new_range_initializer[] = {{range.from(), target_state},
350                                                  {range.to() + 1, 1}};
351      states[0]
352          .insert(states[0].end(),
353                  new_range_initializer,
354                  new_range_initializer + arraysize(new_range_initializer));
355    }
356  }
357  return states;
358}
359
360// Output the generated states as a C++ table. Two tricks are used to compact
361// the table: each state in the table starts with a shift value which indicates
362// how many bits we can discard from the right-hand-side of the byte before
363// doing the table lookup. Secondly, only the state-transitions for bytes
364// with the top-bit set are included in the table; bytes without the top-bit set
365// are just ASCII and are handled directly by the code.
366void PrintStates(const std::vector<State>& states, FILE* stream) {
367  // First calculate the start-offset of each state. This allows the state
368  // machine to jump directly to the correct offset, avoiding an extra
369  // indirection. State 0 starts at offset 0.
370  std::vector<uint8> state_offset(1, 0);
371  std::vector<uint8> shifts;
372  uint8 pos = 0;
373
374  for (std::vector<State>::const_iterator state_it = states.begin();
375       state_it != states.end();
376       ++state_it) {
377    // We want to set |shift| to the (0-based) index of the least-significant
378    // set bit in any of the ranges for this state, since this tells us how many
379    // bits we can discard and still determine what range a byte lies in. Sadly
380    // it appears that ffs() is not portable, so we do it clumsily.
381    uint8 shift = 7;
382    for (State::const_iterator range_it = state_it->begin();
383         range_it != state_it->end();
384         ++range_it) {
385      while (shift > 0 && range_it->from % (1 << shift) != 0) {
386        --shift;
387      }
388    }
389    shifts.push_back(shift);
390    pos += 1 + (1 << (7 - shift));
391    state_offset.push_back(pos);
392  }
393
394  DCHECK_EQ(129, state_offset[1]);
395
396  fputs(kProlog, stream);
397  TablePrinter table_printer(stream);
398
399  for (uint8 state_index = 0; state_index < states.size(); ++state_index) {
400    const uint8 shift = shifts[state_index];
401    uint8 next_range = 0;
402    uint8 target_state = 1;
403    fprintf(stream,
404            "    // State %d, offset 0x%02x\n",
405            static_cast<int>(state_index),
406            static_cast<int>(state_offset[state_index]));
407    table_printer.PrintValue(shift);
408    for (int i = 0; i < 0x100; i += (1 << shift)) {
409      if (next_range < states[state_index].size() &&
410          states[state_index][next_range].from == i) {
411        target_state = states[state_index][next_range].target_state;
412        ++next_range;
413      }
414      if (i >= 0x80) {
415        table_printer.PrintValue(state_offset[target_state]);
416      }
417    }
418    table_printer.NewLine();
419  }
420
421  fputs(kEpilog, stream);
422}
423
424}  // namespace
425
426int main(int argc, char* argv[]) {
427  CommandLine::Init(argc, argv);
428  logging::LoggingSettings settings;
429  settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
430  logging::InitLogging(settings);
431  if (CommandLine::ForCurrentProcess()->HasSwitch("help")) {
432    fwrite(kHelpText, 1, arraysize(kHelpText), stdout);
433    exit(EXIT_SUCCESS);
434  }
435  base::FilePath filename =
436      CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");
437
438  FILE* output = stdout;
439  if (!filename.empty()) {
440    output = base::OpenFile(filename, "wb");
441    if (!output)
442      PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
443                  << "' for writing";
444  }
445
446  // Step 1: Enumerate the characters
447  PairVector pairs = InitializeCharacters();
448  // Step 2: Convert to sets.
449  MoveAllCharsToSets(&pairs);
450  if (VLOG_IS_ON(1)) {
451    LogStringSets(pairs);
452  }
453  // Step 3: Generate states.
454  std::vector<State> states = GenerateStates(pairs);
455  // Step 4/5: Print output
456  PrintStates(states, output);
457
458  if (!filename.empty()) {
459    if (!base::CloseFile(output))
460      PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
461                  << "'";
462  }
463
464  return EXIT_SUCCESS;
465}
466