15d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Copyright 2014 The Chromium Authors. All rights reserved.
25d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// found in the LICENSE file.
45d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Create a state machine for validating UTF-8. The algorithm in brief:
65d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 1. Convert the complete unicode range of code points, except for the
75d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//    surrogate code points, to an ordered array of sequences of bytes in
85d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//    UTF-8.
95d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 2. Convert individual bytes to ranges, starting from the right of each byte
105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//    sequence. For each range, ensure the bytes on the left and the ranges
115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//    on the right are the identical.
125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 3. Convert the resulting list of ranges into a state machine, collapsing
135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//    identical states.
145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 4. Convert the state machine to an array of bytes.
155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 5. Output as a C++ file.
165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//
175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// To use:
185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//  $ ninja -C out/Release build_utf8_validator_tables
195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//  $ out/Release/build_utf8_validator_tables
205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//                                   --output=base/i18n/utf8_validator_tables.cc
215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//  $ git add base/i18n/utf8_validator_tables.cc
225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//
235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Because the table is not expected to ever change, it is checked into the
245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// repository rather than being regenerated at build time.
255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//
265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// This code uses type uint8 throughout to represent bytes, to avoid
275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// signed/unsigned char confusion.
285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <stdio.h>
305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <stdlib.h>
315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <string.h>
325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <algorithm>
345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <map>
355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <string>
365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <vector>
375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/basictypes.h"
395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/command_line.h"
405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/files/file_path.h"
416e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)#include "base/files/file_util.h"
425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/logging.h"
435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/numerics/safe_conversions.h"
445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/strings/stringprintf.h"
455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "third_party/icu/source/common/unicode/utf8.h"
465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace {
485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)const char kHelpText[] =
505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";
515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)const char kProlog[] =
535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "// Copyright 2013 The Chromium Authors. All rights reserved.\n"
545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "// Use of this source code is governed by a BSD-style license that can "
555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "be\n"
565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "// found in the LICENSE file.\n"
575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "\n"
585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "// This file is auto-generated by build_utf8_validator_tables.\n"
595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "// DO NOT EDIT.\n"
605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "\n"
615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "#include \"base/i18n/utf8_validator_tables.h\"\n"
625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "\n"
635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "namespace base {\n"
645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "namespace internal {\n"
655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "\n"
665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "const uint8 kUtf8ValidatorTables[] = {\n";
675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)const char kEpilog[] =
695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "};\n"
705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "\n"
715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n"
725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "\n"
735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "}  // namespace internal\n"
745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "}  // namespace base\n";
755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Ranges are inclusive at both ends--they represent [from, to]
775d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)class Range {
785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) public:
795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Ranges always start with just one byte.
805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  explicit Range(uint8 value) : from_(value), to_(value) {}
815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Range objects are copyable and assignable to be used in STL
835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // containers. Since they only contain non-pointer POD types, the default copy
845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // constructor, assignment operator and destructor will work.
855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Add a byte to the range. We intentionally only support adding a byte at the
875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // end, since that is the only operation the code needs.
885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  void AddByte(uint8 to) {
895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    CHECK(to == to_ + 1);
905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    to_ = to;
915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  uint8 from() const { return from_; }
945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  uint8 to() const { return to_; }
955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  bool operator<(const Range& rhs) const {
975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));
985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  bool operator==(const Range& rhs) const {
1015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    return from() == rhs.from() && to() == rhs.to();
1025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
1035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) private:
1055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  uint8 from_;
1065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  uint8 to_;
1075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)};
1085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A vector of Ranges is like a simple regular expression--it corresponds to
1105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// a set of strings of the same length that have bytes in each position in
1115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// the appropriate range.
1125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::vector<Range> StringSet;
1135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A UTF-8 "character" is represented by a sequence of bytes.
1155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::vector<uint8> Character;
1165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// In the second stage of the algorithm, we want to convert a large list of
1185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Characters into a small list of StringSets.
1195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)struct Pair {
1205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  Character character;
1215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  StringSet set;
1225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)};
1235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::vector<Pair> PairVector;
1255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A class to print a table of numbers in the same style as clang-format.
1275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)class TablePrinter {
1285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) public:
1295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  explicit TablePrinter(FILE* stream)
1305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      : stream_(stream), values_on_this_line_(0), current_offset_(0) {}
1315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  void PrintValue(uint8 value) {
1335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if (values_on_this_line_ == 0) {
1345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      fputs("   ", stream_);
1355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    } else if (values_on_this_line_ == kMaxValuesPerLine) {
1365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      fprintf(stream_, "  // 0x%02x\n   ", current_offset_);
1375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      values_on_this_line_ = 0;
1385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
1395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    fprintf(stream_, " 0x%02x,", static_cast<int>(value));
1405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    ++values_on_this_line_;
1415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    ++current_offset_;
1425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
1435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  void NewLine() {
1455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    while (values_on_this_line_ < kMaxValuesPerLine) {
1465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      fputs("      ", stream_);
1475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      ++values_on_this_line_;
1485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
1495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    fprintf(stream_, "  // 0x%02x\n", current_offset_);
1505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    values_on_this_line_ = 0;
1515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
1525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) private:
1545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // stdio stream. Not owned.
1555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  FILE* stream_;
1565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Number of values so far printed on this line.
1585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  int values_on_this_line_;
1595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Total values printed so far.
1615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  int current_offset_;
1625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  static const int kMaxValuesPerLine = 8;
1645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(TablePrinter);
1665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)};
1675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Start by filling a PairVector with characters. The resulting vector goes from
1695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// "\x00" to "\xf4\x8f\xbf\xbf".
1705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)PairVector InitializeCharacters() {
1715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  PairVector vector;
1725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  for (int i = 0; i <= 0x10FFFF; ++i) {
1735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if (i >= 0xD800 && i < 0xE000) {
1745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      // Surrogate codepoints are not permitted. Non-character code points are
1755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      // explicitly permitted.
1765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      continue;
1775d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
1785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    uint8 bytes[4];
1795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    unsigned int offset = 0;
1805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    UBool is_error = false;
1815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    U8_APPEND(bytes, offset, arraysize(bytes), i, is_error);
1825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    DCHECK(!is_error);
1835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    DCHECK_GT(offset, 0u);
1845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    DCHECK_LE(offset, arraysize(bytes));
1855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    Pair pair = {Character(bytes, bytes + offset), StringSet()};
1865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    vector.push_back(pair);
1875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
1885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  return vector;
1895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
1905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Construct a new Pair from |character| and the concatenation of |new_range|
1925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// and |existing_set|, and append it to |pairs|.
1935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void ConstructPairAndAppend(const Character& character,
1945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                            const Range& new_range,
1955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                            const StringSet& existing_set,
1965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                            PairVector* pairs) {
1975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  Pair new_pair = {character, StringSet(1, new_range)};
1985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  new_pair.set.insert(
1995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      new_pair.set.end(), existing_set.begin(), existing_set.end());
2005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  pairs->push_back(new_pair);
2015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
2025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Each pass over the PairVector strips one byte off the right-hand-side of the
2045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// characters and adds a range to the set on the right. For example, the first
2055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
2065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
2075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
2085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// [\xa0-\xbf][\x80-\xbf]).
2095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void MoveRightMostCharToSet(PairVector* pairs) {
2105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  PairVector new_pairs;
2115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  PairVector::const_iterator it = pairs->begin();
2125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  while (it != pairs->end() && it->character.empty()) {
2135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    new_pairs.push_back(*it);
2145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    ++it;
2155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
2165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  CHECK(it != pairs->end());
2175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
2185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  Range new_range(it->character.back());
2195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  StringSet converted = it->set;
2205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  ++it;
2215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  while (it != pairs->end()) {
2225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    const Pair& current_pair = *it++;
2235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
2245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        std::equal(unconverted_bytes.begin(),
2255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                   unconverted_bytes.end(),
2265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                   current_pair.character.begin()) &&
2275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        converted == current_pair.set) {
2285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      // The particular set of UTF-8 codepoints we are validating guarantees
2295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      // that each byte range will be contiguous. This would not necessarily be
2305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      // true for an arbitrary set of UTF-8 codepoints.
2315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
2325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      new_range.AddByte(current_pair.character.back());
2335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      continue;
2345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
2355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
2365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    unconverted_bytes = Character(current_pair.character.begin(),
2375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                  current_pair.character.end() - 1);
2385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    new_range = Range(current_pair.character.back());
2395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    converted = current_pair.set;
2405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
2415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
2425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  new_pairs.swap(*pairs);
2435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
2445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void MoveAllCharsToSets(PairVector* pairs) {
2465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Since each pass of the function moves one character, and UTF-8 sequences
2475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // are at most 4 characters long, this simply runs the algorithm four times.
2485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  for (int i = 0; i < 4; ++i) {
2495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    MoveRightMostCharToSet(pairs);
2505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
251a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)#if DCHECK_IS_ON
252a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)  for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
253a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)       ++it) {
254a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    DCHECK(it->character.empty());
2555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
256a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)#endif
2575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
2585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
2605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
2615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// algorithm is working. Use the command-line option
2625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// --vmodule=build_utf8_validator_tables=1 to see this output.
2635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void LogStringSets(const PairVector& pairs) {
2645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  for (PairVector::const_iterator pair_it = pairs.begin();
2655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)       pair_it != pairs.end();
2665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)       ++pair_it) {
2675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    std::string set_as_string;
2685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    for (StringSet::const_iterator set_it = pair_it->set.begin();
2695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)         set_it != pair_it->set.end();
2705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)         ++set_it) {
2715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
2725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                          static_cast<int>(set_it->from()),
2735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                          static_cast<int>(set_it->to()));
2745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
2755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    VLOG(1) << set_as_string;
2765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
2775d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
2785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A single state in the state machine is represented by a sorted vector of
2805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// start bytes and target states. All input bytes in the range between the start
2815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// byte and the next entry in the vector (or 0xFF) result in a transition to the
2825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// target state.
2835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)struct StateRange {
2845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  uint8 from;
2855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  uint8 target_state;
2865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)};
2875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::vector<StateRange> State;
2895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Generates a state where all bytes go to state 1 (invalid). This is also used
2915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// as an initialiser for other states (since bytes from outside the desired
2925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// range are invalid).
2935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)State GenerateInvalidState() {
2945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  const StateRange range = {0, 1};
2955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  return State(1, range);
2965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
2975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A map from a state (ie. a set of strings which will match from this state) to
2995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// a number (which is an index into the array of states).
3005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::map<StringSet, uint8> StateMap;
3015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
3025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Create a new state corresponding to |set|, add it |states| and |state_map|
3035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// and return the index it was given in |states|.
3045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)uint8 MakeState(const StringSet& set,
3055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                std::vector<State>* states,
3065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                StateMap* state_map) {
3075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  DCHECK(!set.empty());
3085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  const Range& range = set.front();
3095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  const StringSet rest(set.begin() + 1, set.end());
3105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  const StateMap::const_iterator where = state_map->find(rest);
3115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  const uint8 target_state = where == state_map->end()
3125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                 ? MakeState(rest, states, state_map)
3135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                 : where->second;
3145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  DCHECK_LT(0, range.from());
3155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  DCHECK_LT(range.to(), 0xFF);
3165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  const StateRange new_state_initializer[] = {
317a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)      {0, 1}, {range.from(), target_state},
318a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)      {static_cast<uint8>(range.to() + 1), 1}};
3195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  states->push_back(
3205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      State(new_state_initializer,
3215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            new_state_initializer + arraysize(new_state_initializer)));
3225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  const uint8 new_state_number =
3235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      base::checked_cast<uint8>(states->size() - 1);
3245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
3255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  return new_state_number;
3265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
3275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
3285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)std::vector<State> GenerateStates(const PairVector& pairs) {
3295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // States 0 and 1 are the initial/valid state and invalid state, respectively.
3305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  std::vector<State> states(2, GenerateInvalidState());
3315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  StateMap state_map;
3325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  state_map.insert(std::make_pair(StringSet(), 0));
3335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) {
3345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    DCHECK(it->character.empty());
3355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    DCHECK(!it->set.empty());
3365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    const Range& range = it->set.front();
3375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    const StringSet rest(it->set.begin() + 1, it->set.end());
3385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    const StateMap::const_iterator where = state_map.find(rest);
3395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    const uint8 target_state = where == state_map.end()
3405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                   ? MakeState(rest, &states, &state_map)
3415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                   : where->second;
3425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if (states[0].back().from == range.from()) {
3435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      DCHECK_EQ(1, states[0].back().target_state);
3445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      states[0].back().target_state = target_state;
3455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      DCHECK_LT(range.to(), 0xFF);
346a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)      const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1};
3475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      states[0].push_back(new_range);
3485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    } else {
3495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      DCHECK_LT(range.to(), 0xFF);
3505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      const StateRange new_range_initializer[] = {{range.from(), target_state},
351a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)           {static_cast<uint8>(range.to() + 1), 1}};
3525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      states[0]
3535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          .insert(states[0].end(),
3545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                  new_range_initializer,
3555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                  new_range_initializer + arraysize(new_range_initializer));
3565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
3575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
3585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  return states;
3595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
3605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
3615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Output the generated states as a C++ table. Two tricks are used to compact
3625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// the table: each state in the table starts with a shift value which indicates
3635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// how many bits we can discard from the right-hand-side of the byte before
3645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// doing the table lookup. Secondly, only the state-transitions for bytes
3655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// with the top-bit set are included in the table; bytes without the top-bit set
3665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// are just ASCII and are handled directly by the code.
3675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void PrintStates(const std::vector<State>& states, FILE* stream) {
3685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // First calculate the start-offset of each state. This allows the state
3695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // machine to jump directly to the correct offset, avoiding an extra
3705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // indirection. State 0 starts at offset 0.
3715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  std::vector<uint8> state_offset(1, 0);
3725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  std::vector<uint8> shifts;
3735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  uint8 pos = 0;
3745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
3755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  for (std::vector<State>::const_iterator state_it = states.begin();
3765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)       state_it != states.end();
3775d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)       ++state_it) {
3785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    // We want to set |shift| to the (0-based) index of the least-significant
3795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    // set bit in any of the ranges for this state, since this tells us how many
3805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    // bits we can discard and still determine what range a byte lies in. Sadly
3815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    // it appears that ffs() is not portable, so we do it clumsily.
3825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    uint8 shift = 7;
3835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    for (State::const_iterator range_it = state_it->begin();
3845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)         range_it != state_it->end();
3855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)         ++range_it) {
3865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      while (shift > 0 && range_it->from % (1 << shift) != 0) {
3875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        --shift;
3885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      }
3895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
3905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    shifts.push_back(shift);
3915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    pos += 1 + (1 << (7 - shift));
3925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    state_offset.push_back(pos);
3935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
3945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
3955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  DCHECK_EQ(129, state_offset[1]);
3965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
3975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  fputs(kProlog, stream);
3985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  TablePrinter table_printer(stream);
3995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
4005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  for (uint8 state_index = 0; state_index < states.size(); ++state_index) {
4015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    const uint8 shift = shifts[state_index];
4025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    uint8 next_range = 0;
4035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    uint8 target_state = 1;
4045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    fprintf(stream,
4055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            "    // State %d, offset 0x%02x\n",
4065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            static_cast<int>(state_index),
4075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            static_cast<int>(state_offset[state_index]));
4085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    table_printer.PrintValue(shift);
4095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    for (int i = 0; i < 0x100; i += (1 << shift)) {
4105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      if (next_range < states[state_index].size() &&
4115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          states[state_index][next_range].from == i) {
4125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        target_state = states[state_index][next_range].target_state;
4135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        ++next_range;
4145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      }
4155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      if (i >= 0x80) {
4165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        table_printer.PrintValue(state_offset[target_state]);
4175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      }
4185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
4195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    table_printer.NewLine();
4205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
4215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
4225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  fputs(kEpilog, stream);
4235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
4245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
4255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}  // namespace
4265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
4275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)int main(int argc, char* argv[]) {
4285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  CommandLine::Init(argc, argv);
4295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  logging::LoggingSettings settings;
4305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
4315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  logging::InitLogging(settings);
4325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  if (CommandLine::ForCurrentProcess()->HasSwitch("help")) {
4335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    fwrite(kHelpText, 1, arraysize(kHelpText), stdout);
4345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    exit(EXIT_SUCCESS);
4355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
4365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  base::FilePath filename =
4375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");
4385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
4395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  FILE* output = stdout;
4405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  if (!filename.empty()) {
4415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    output = base::OpenFile(filename, "wb");
4425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if (!output)
4435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
4445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                  << "' for writing";
4455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
4465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
4475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Step 1: Enumerate the characters
4485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  PairVector pairs = InitializeCharacters();
4495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Step 2: Convert to sets.
4505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  MoveAllCharsToSets(&pairs);
4515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  if (VLOG_IS_ON(1)) {
4525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    LogStringSets(pairs);
4535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
4545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Step 3: Generate states.
4555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  std::vector<State> states = GenerateStates(pairs);
4565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Step 4/5: Print output
4575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  PrintStates(states, output);
4585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
4595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  if (!filename.empty()) {
4605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if (!base::CloseFile(output))
4615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
4625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                  << "'";
4635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
4645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
4655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  return EXIT_SUCCESS;
4665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
467