15d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Copyright 2014 The Chromium Authors. All rights reserved. 25d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// found in the LICENSE file. 45d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Create a state machine for validating UTF-8. The algorithm in brief: 65d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 1. Convert the complete unicode range of code points, except for the 75d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// surrogate code points, to an ordered array of sequences of bytes in 85d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// UTF-8. 95d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 2. Convert individual bytes to ranges, starting from the right of each byte 105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// sequence. For each range, ensure the bytes on the left and the ranges 115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// on the right are the identical. 125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 3. Convert the resulting list of ranges into a state machine, collapsing 135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// identical states. 145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 4. Convert the state machine to an array of bytes. 155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 5. Output as a C++ file. 165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// To use: 185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// $ ninja -C out/Release build_utf8_validator_tables 195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// $ out/Release/build_utf8_validator_tables 205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// --output=base/i18n/utf8_validator_tables.cc 215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// $ git add base/i18n/utf8_validator_tables.cc 225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Because the table is not expected to ever change, it is checked into the 245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// repository rather than being regenerated at build time. 255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// This code uses type uint8 throughout to represent bytes, to avoid 275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// signed/unsigned char confusion. 285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <stdio.h> 305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <stdlib.h> 315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <string.h> 325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <algorithm> 345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <map> 355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <string> 365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <vector> 375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/basictypes.h" 395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/command_line.h" 405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/files/file_path.h" 416e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)#include "base/files/file_util.h" 425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/logging.h" 435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/numerics/safe_conversions.h" 445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/strings/stringprintf.h" 455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "third_party/icu/source/common/unicode/utf8.h" 465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace { 485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)const char kHelpText[] = 505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n"; 515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)const char kProlog[] = 535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "// Copyright 2013 The Chromium Authors. All rights reserved.\n" 545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "// Use of this source code is governed by a BSD-style license that can " 555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "be\n" 565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "// found in the LICENSE file.\n" 575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "\n" 585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "// This file is auto-generated by build_utf8_validator_tables.\n" 595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "// DO NOT EDIT.\n" 605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "\n" 615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "#include \"base/i18n/utf8_validator_tables.h\"\n" 625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "\n" 635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "namespace base {\n" 645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "namespace internal {\n" 655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "\n" 665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "const uint8 kUtf8ValidatorTables[] = {\n"; 675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)const char kEpilog[] = 695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "};\n" 705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "\n" 715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n" 725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "\n" 735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "} // namespace internal\n" 745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "} // namespace base\n"; 755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Ranges are inclusive at both ends--they represent [from, to] 775d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)class Range { 785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) public: 795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Ranges always start with just one byte. 805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) explicit Range(uint8 value) : from_(value), to_(value) {} 815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Range objects are copyable and assignable to be used in STL 835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // containers. Since they only contain non-pointer POD types, the default copy 845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // constructor, assignment operator and destructor will work. 855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Add a byte to the range. We intentionally only support adding a byte at the 875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // end, since that is the only operation the code needs. 885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) void AddByte(uint8 to) { 895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) CHECK(to == to_ + 1); 905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) to_ = to; 915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 from() const { return from_; } 945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 to() const { return to_; } 955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) bool operator<(const Range& rhs) const { 975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to())); 985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) bool operator==(const Range& rhs) const { 1015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return from() == rhs.from() && to() == rhs.to(); 1025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 1035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) private: 1055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 from_; 1065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 to_; 1075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}; 1085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A vector of Ranges is like a simple regular expression--it corresponds to 1105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// a set of strings of the same length that have bytes in each position in 1115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// the appropriate range. 1125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::vector<Range> StringSet; 1135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A UTF-8 "character" is represented by a sequence of bytes. 1155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::vector<uint8> Character; 1165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// In the second stage of the algorithm, we want to convert a large list of 1185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Characters into a small list of StringSets. 1195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)struct Pair { 1205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) Character character; 1215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) StringSet set; 1225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}; 1235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::vector<Pair> PairVector; 1255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A class to print a table of numbers in the same style as clang-format. 1275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)class TablePrinter { 1285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) public: 1295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) explicit TablePrinter(FILE* stream) 1305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) : stream_(stream), values_on_this_line_(0), current_offset_(0) {} 1315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) void PrintValue(uint8 value) { 1335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (values_on_this_line_ == 0) { 1345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) fputs(" ", stream_); 1355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } else if (values_on_this_line_ == kMaxValuesPerLine) { 1365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) fprintf(stream_, " // 0x%02x\n ", current_offset_); 1375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) values_on_this_line_ = 0; 1385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 1395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) fprintf(stream_, " 0x%02x,", static_cast<int>(value)); 1405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++values_on_this_line_; 1415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++current_offset_; 1425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 1435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) void NewLine() { 1455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) while (values_on_this_line_ < kMaxValuesPerLine) { 1465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) fputs(" ", stream_); 1475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++values_on_this_line_; 1485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 1495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) fprintf(stream_, " // 0x%02x\n", current_offset_); 1505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) values_on_this_line_ = 0; 1515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 1525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) private: 1545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // stdio stream. Not owned. 1555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) FILE* stream_; 1565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Number of values so far printed on this line. 1585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) int values_on_this_line_; 1595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Total values printed so far. 1615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) int current_offset_; 1625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) static const int kMaxValuesPerLine = 8; 1645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(TablePrinter); 1665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}; 1675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Start by filling a PairVector with characters. The resulting vector goes from 1695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// "\x00" to "\xf4\x8f\xbf\xbf". 1705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)PairVector InitializeCharacters() { 1715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) PairVector vector; 1725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (int i = 0; i <= 0x10FFFF; ++i) { 1735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (i >= 0xD800 && i < 0xE000) { 1745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Surrogate codepoints are not permitted. Non-character code points are 1755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // explicitly permitted. 1765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) continue; 1775d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 1785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 bytes[4]; 1795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) unsigned int offset = 0; 1805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) UBool is_error = false; 1815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) U8_APPEND(bytes, offset, arraysize(bytes), i, is_error); 1825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK(!is_error); 1835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_GT(offset, 0u); 1845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_LE(offset, arraysize(bytes)); 1855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) Pair pair = {Character(bytes, bytes + offset), StringSet()}; 1865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) vector.push_back(pair); 1875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 1885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return vector; 1895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 1905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Construct a new Pair from |character| and the concatenation of |new_range| 1925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// and |existing_set|, and append it to |pairs|. 1935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void ConstructPairAndAppend(const Character& character, 1945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const Range& new_range, 1955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const StringSet& existing_set, 1965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) PairVector* pairs) { 1975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) Pair new_pair = {character, StringSet(1, new_range)}; 1985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) new_pair.set.insert( 1995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) new_pair.set.end(), existing_set.begin(), existing_set.end()); 2005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) pairs->push_back(new_pair); 2015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 2025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Each pass over the PairVector strips one byte off the right-hand-side of the 2045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// characters and adds a range to the set on the right. For example, the first 2055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", 2065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0", 2075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", 2085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// [\xa0-\xbf][\x80-\xbf]). 2095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void MoveRightMostCharToSet(PairVector* pairs) { 2105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) PairVector new_pairs; 2115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) PairVector::const_iterator it = pairs->begin(); 2125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) while (it != pairs->end() && it->character.empty()) { 2135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) new_pairs.push_back(*it); 2145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++it; 2155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 2165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) CHECK(it != pairs->end()); 2175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) Character unconverted_bytes(it->character.begin(), it->character.end() - 1); 2185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) Range new_range(it->character.back()); 2195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) StringSet converted = it->set; 2205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++it; 2215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) while (it != pairs->end()) { 2225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const Pair& current_pair = *it++; 2235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (current_pair.character.size() == unconverted_bytes.size() + 1 && 2245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) std::equal(unconverted_bytes.begin(), 2255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) unconverted_bytes.end(), 2265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) current_pair.character.begin()) && 2275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) converted == current_pair.set) { 2285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // The particular set of UTF-8 codepoints we are validating guarantees 2295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // that each byte range will be contiguous. This would not necessarily be 2305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // true for an arbitrary set of UTF-8 codepoints. 2315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_EQ(new_range.to() + 1, current_pair.character.back()); 2325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) new_range.AddByte(current_pair.character.back()); 2335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) continue; 2345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 2355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); 2365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) unconverted_bytes = Character(current_pair.character.begin(), 2375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) current_pair.character.end() - 1); 2385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) new_range = Range(current_pair.character.back()); 2395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) converted = current_pair.set; 2405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 2415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); 2425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) new_pairs.swap(*pairs); 2435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 2445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void MoveAllCharsToSets(PairVector* pairs) { 2465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Since each pass of the function moves one character, and UTF-8 sequences 2475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // are at most 4 characters long, this simply runs the algorithm four times. 2485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (int i = 0; i < 4; ++i) { 2495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) MoveRightMostCharToSet(pairs); 2505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 251a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)#if DCHECK_IS_ON 252a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); 253a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) ++it) { 254a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) DCHECK(it->character.empty()); 2555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 256a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)#endif 2575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 2585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], 2605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the 2615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// algorithm is working. Use the command-line option 2625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// --vmodule=build_utf8_validator_tables=1 to see this output. 2635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void LogStringSets(const PairVector& pairs) { 2645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (PairVector::const_iterator pair_it = pairs.begin(); 2655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) pair_it != pairs.end(); 2665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++pair_it) { 2675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) std::string set_as_string; 2685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (StringSet::const_iterator set_it = pair_it->set.begin(); 2695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) set_it != pair_it->set.end(); 2705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++set_it) { 2715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]", 2725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) static_cast<int>(set_it->from()), 2735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) static_cast<int>(set_it->to())); 2745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 2755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) VLOG(1) << set_as_string; 2765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 2775d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 2785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A single state in the state machine is represented by a sorted vector of 2805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// start bytes and target states. All input bytes in the range between the start 2815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// byte and the next entry in the vector (or 0xFF) result in a transition to the 2825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// target state. 2835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)struct StateRange { 2845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 from; 2855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 target_state; 2865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}; 2875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::vector<StateRange> State; 2895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Generates a state where all bytes go to state 1 (invalid). This is also used 2915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// as an initialiser for other states (since bytes from outside the desired 2925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// range are invalid). 2935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)State GenerateInvalidState() { 2945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const StateRange range = {0, 1}; 2955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return State(1, range); 2965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 2975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A map from a state (ie. a set of strings which will match from this state) to 2995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// a number (which is an index into the array of states). 3005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)typedef std::map<StringSet, uint8> StateMap; 3015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 3025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Create a new state corresponding to |set|, add it |states| and |state_map| 3035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// and return the index it was given in |states|. 3045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)uint8 MakeState(const StringSet& set, 3055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) std::vector<State>* states, 3065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) StateMap* state_map) { 3075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK(!set.empty()); 3085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const Range& range = set.front(); 3095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const StringSet rest(set.begin() + 1, set.end()); 3105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const StateMap::const_iterator where = state_map->find(rest); 3115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const uint8 target_state = where == state_map->end() 3125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ? MakeState(rest, states, state_map) 3135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) : where->second; 3145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_LT(0, range.from()); 3155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_LT(range.to(), 0xFF); 3165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const StateRange new_state_initializer[] = { 317a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) {0, 1}, {range.from(), target_state}, 318a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) {static_cast<uint8>(range.to() + 1), 1}}; 3195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) states->push_back( 3205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) State(new_state_initializer, 3215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) new_state_initializer + arraysize(new_state_initializer))); 3225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const uint8 new_state_number = 3235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) base::checked_cast<uint8>(states->size() - 1); 3245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) CHECK(state_map->insert(std::make_pair(set, new_state_number)).second); 3255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return new_state_number; 3265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 3275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 3285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)std::vector<State> GenerateStates(const PairVector& pairs) { 3295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // States 0 and 1 are the initial/valid state and invalid state, respectively. 3305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) std::vector<State> states(2, GenerateInvalidState()); 3315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) StateMap state_map; 3325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) state_map.insert(std::make_pair(StringSet(), 0)); 3335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) { 3345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK(it->character.empty()); 3355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK(!it->set.empty()); 3365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const Range& range = it->set.front(); 3375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const StringSet rest(it->set.begin() + 1, it->set.end()); 3385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const StateMap::const_iterator where = state_map.find(rest); 3395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const uint8 target_state = where == state_map.end() 3405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ? MakeState(rest, &states, &state_map) 3415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) : where->second; 3425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (states[0].back().from == range.from()) { 3435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_EQ(1, states[0].back().target_state); 3445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) states[0].back().target_state = target_state; 3455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_LT(range.to(), 0xFF); 346a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1}; 3475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) states[0].push_back(new_range); 3485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } else { 3495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_LT(range.to(), 0xFF); 3505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const StateRange new_range_initializer[] = {{range.from(), target_state}, 351a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) {static_cast<uint8>(range.to() + 1), 1}}; 3525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) states[0] 3535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) .insert(states[0].end(), 3545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) new_range_initializer, 3555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) new_range_initializer + arraysize(new_range_initializer)); 3565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 3575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 3585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return states; 3595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 3605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 3615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Output the generated states as a C++ table. Two tricks are used to compact 3625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// the table: each state in the table starts with a shift value which indicates 3635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// how many bits we can discard from the right-hand-side of the byte before 3645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// doing the table lookup. Secondly, only the state-transitions for bytes 3655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// with the top-bit set are included in the table; bytes without the top-bit set 3665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// are just ASCII and are handled directly by the code. 3675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void PrintStates(const std::vector<State>& states, FILE* stream) { 3685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // First calculate the start-offset of each state. This allows the state 3695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // machine to jump directly to the correct offset, avoiding an extra 3705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // indirection. State 0 starts at offset 0. 3715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) std::vector<uint8> state_offset(1, 0); 3725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) std::vector<uint8> shifts; 3735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 pos = 0; 3745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 3755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (std::vector<State>::const_iterator state_it = states.begin(); 3765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) state_it != states.end(); 3775d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++state_it) { 3785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // We want to set |shift| to the (0-based) index of the least-significant 3795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // set bit in any of the ranges for this state, since this tells us how many 3805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // bits we can discard and still determine what range a byte lies in. Sadly 3815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // it appears that ffs() is not portable, so we do it clumsily. 3825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 shift = 7; 3835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (State::const_iterator range_it = state_it->begin(); 3845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) range_it != state_it->end(); 3855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++range_it) { 3865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) while (shift > 0 && range_it->from % (1 << shift) != 0) { 3875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) --shift; 3885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 3895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 3905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) shifts.push_back(shift); 3915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) pos += 1 + (1 << (7 - shift)); 3925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) state_offset.push_back(pos); 3935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 3945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 3955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_EQ(129, state_offset[1]); 3965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 3975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) fputs(kProlog, stream); 3985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) TablePrinter table_printer(stream); 3995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 4005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (uint8 state_index = 0; state_index < states.size(); ++state_index) { 4015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const uint8 shift = shifts[state_index]; 4025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 next_range = 0; 4035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 target_state = 1; 4045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) fprintf(stream, 4055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) " // State %d, offset 0x%02x\n", 4065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) static_cast<int>(state_index), 4075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) static_cast<int>(state_offset[state_index])); 4085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) table_printer.PrintValue(shift); 4095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (int i = 0; i < 0x100; i += (1 << shift)) { 4105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (next_range < states[state_index].size() && 4115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) states[state_index][next_range].from == i) { 4125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) target_state = states[state_index][next_range].target_state; 4135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++next_range; 4145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 4155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (i >= 0x80) { 4165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) table_printer.PrintValue(state_offset[target_state]); 4175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 4185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 4195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) table_printer.NewLine(); 4205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 4215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 4225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) fputs(kEpilog, stream); 4235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 4245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 4255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} // namespace 4265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 4275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)int main(int argc, char* argv[]) { 4285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) CommandLine::Init(argc, argv); 4295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) logging::LoggingSettings settings; 4305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; 4315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) logging::InitLogging(settings); 4325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (CommandLine::ForCurrentProcess()->HasSwitch("help")) { 4335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) fwrite(kHelpText, 1, arraysize(kHelpText), stdout); 4345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) exit(EXIT_SUCCESS); 4355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 4365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) base::FilePath filename = 4375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) CommandLine::ForCurrentProcess()->GetSwitchValuePath("output"); 4385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 4395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) FILE* output = stdout; 4405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (!filename.empty()) { 4415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) output = base::OpenFile(filename, "wb"); 4425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (!output) 4435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe() 4445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) << "' for writing"; 4455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 4465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 4475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Step 1: Enumerate the characters 4485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) PairVector pairs = InitializeCharacters(); 4495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Step 2: Convert to sets. 4505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) MoveAllCharsToSets(&pairs); 4515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (VLOG_IS_ON(1)) { 4525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) LogStringSets(pairs); 4535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 4545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Step 3: Generate states. 4555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) std::vector<State> states = GenerateStates(pairs); 4565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Step 4/5: Print output 4575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) PrintStates(states, output); 4585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 4595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (!filename.empty()) { 4605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (!base::CloseFile(output)) 4615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() 4625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) << "'"; 4635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 4645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 4655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return EXIT_SUCCESS; 4665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 467