1// Copyright 2014 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// Create a state machine for validating UTF-8. The algorithm in brief: 6// 1. Convert the complete unicode range of code points, except for the 7// surrogate code points, to an ordered array of sequences of bytes in 8// UTF-8. 9// 2. Convert individual bytes to ranges, starting from the right of each byte 10// sequence. For each range, ensure the bytes on the left and the ranges 11// on the right are the identical. 12// 3. Convert the resulting list of ranges into a state machine, collapsing 13// identical states. 14// 4. Convert the state machine to an array of bytes. 15// 5. Output as a C++ file. 16// 17// To use: 18// $ ninja -C out/Release build_utf8_validator_tables 19// $ out/Release/build_utf8_validator_tables 20// --output=base/i18n/utf8_validator_tables.cc 21// $ git add base/i18n/utf8_validator_tables.cc 22// 23// Because the table is not expected to ever change, it is checked into the 24// repository rather than being regenerated at build time. 25// 26// This code uses type uint8 throughout to represent bytes, to avoid 27// signed/unsigned char confusion. 28 29#include <stdio.h> 30#include <stdlib.h> 31#include <string.h> 32 33#include <algorithm> 34#include <map> 35#include <string> 36#include <vector> 37 38#include "base/basictypes.h" 39#include "base/command_line.h" 40#include "base/files/file_path.h" 41#include "base/files/file_util.h" 42#include "base/logging.h" 43#include "base/numerics/safe_conversions.h" 44#include "base/strings/stringprintf.h" 45#include "third_party/icu/source/common/unicode/utf8.h" 46 47namespace { 48 49const char kHelpText[] = 50 "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n"; 51 52const char kProlog[] = 53 "// Copyright 2013 The Chromium Authors. All rights reserved.\n" 54 "// Use of this source code is governed by a BSD-style license that can " 55 "be\n" 56 "// found in the LICENSE file.\n" 57 "\n" 58 "// This file is auto-generated by build_utf8_validator_tables.\n" 59 "// DO NOT EDIT.\n" 60 "\n" 61 "#include \"base/i18n/utf8_validator_tables.h\"\n" 62 "\n" 63 "namespace base {\n" 64 "namespace internal {\n" 65 "\n" 66 "const uint8 kUtf8ValidatorTables[] = {\n"; 67 68const char kEpilog[] = 69 "};\n" 70 "\n" 71 "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n" 72 "\n" 73 "} // namespace internal\n" 74 "} // namespace base\n"; 75 76// Ranges are inclusive at both ends--they represent [from, to] 77class Range { 78 public: 79 // Ranges always start with just one byte. 80 explicit Range(uint8 value) : from_(value), to_(value) {} 81 82 // Range objects are copyable and assignable to be used in STL 83 // containers. Since they only contain non-pointer POD types, the default copy 84 // constructor, assignment operator and destructor will work. 85 86 // Add a byte to the range. We intentionally only support adding a byte at the 87 // end, since that is the only operation the code needs. 88 void AddByte(uint8 to) { 89 CHECK(to == to_ + 1); 90 to_ = to; 91 } 92 93 uint8 from() const { return from_; } 94 uint8 to() const { return to_; } 95 96 bool operator<(const Range& rhs) const { 97 return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to())); 98 } 99 100 bool operator==(const Range& rhs) const { 101 return from() == rhs.from() && to() == rhs.to(); 102 } 103 104 private: 105 uint8 from_; 106 uint8 to_; 107}; 108 109// A vector of Ranges is like a simple regular expression--it corresponds to 110// a set of strings of the same length that have bytes in each position in 111// the appropriate range. 112typedef std::vector<Range> StringSet; 113 114// A UTF-8 "character" is represented by a sequence of bytes. 115typedef std::vector<uint8> Character; 116 117// In the second stage of the algorithm, we want to convert a large list of 118// Characters into a small list of StringSets. 119struct Pair { 120 Character character; 121 StringSet set; 122}; 123 124typedef std::vector<Pair> PairVector; 125 126// A class to print a table of numbers in the same style as clang-format. 127class TablePrinter { 128 public: 129 explicit TablePrinter(FILE* stream) 130 : stream_(stream), values_on_this_line_(0), current_offset_(0) {} 131 132 void PrintValue(uint8 value) { 133 if (values_on_this_line_ == 0) { 134 fputs(" ", stream_); 135 } else if (values_on_this_line_ == kMaxValuesPerLine) { 136 fprintf(stream_, " // 0x%02x\n ", current_offset_); 137 values_on_this_line_ = 0; 138 } 139 fprintf(stream_, " 0x%02x,", static_cast<int>(value)); 140 ++values_on_this_line_; 141 ++current_offset_; 142 } 143 144 void NewLine() { 145 while (values_on_this_line_ < kMaxValuesPerLine) { 146 fputs(" ", stream_); 147 ++values_on_this_line_; 148 } 149 fprintf(stream_, " // 0x%02x\n", current_offset_); 150 values_on_this_line_ = 0; 151 } 152 153 private: 154 // stdio stream. Not owned. 155 FILE* stream_; 156 157 // Number of values so far printed on this line. 158 int values_on_this_line_; 159 160 // Total values printed so far. 161 int current_offset_; 162 163 static const int kMaxValuesPerLine = 8; 164 165 DISALLOW_COPY_AND_ASSIGN(TablePrinter); 166}; 167 168// Start by filling a PairVector with characters. The resulting vector goes from 169// "\x00" to "\xf4\x8f\xbf\xbf". 170PairVector InitializeCharacters() { 171 PairVector vector; 172 for (int i = 0; i <= 0x10FFFF; ++i) { 173 if (i >= 0xD800 && i < 0xE000) { 174 // Surrogate codepoints are not permitted. Non-character code points are 175 // explicitly permitted. 176 continue; 177 } 178 uint8 bytes[4]; 179 unsigned int offset = 0; 180 UBool is_error = false; 181 U8_APPEND(bytes, offset, arraysize(bytes), i, is_error); 182 DCHECK(!is_error); 183 DCHECK_GT(offset, 0u); 184 DCHECK_LE(offset, arraysize(bytes)); 185 Pair pair = {Character(bytes, bytes + offset), StringSet()}; 186 vector.push_back(pair); 187 } 188 return vector; 189} 190 191// Construct a new Pair from |character| and the concatenation of |new_range| 192// and |existing_set|, and append it to |pairs|. 193void ConstructPairAndAppend(const Character& character, 194 const Range& new_range, 195 const StringSet& existing_set, 196 PairVector* pairs) { 197 Pair new_pair = {character, StringSet(1, new_range)}; 198 new_pair.set.insert( 199 new_pair.set.end(), existing_set.begin(), existing_set.end()); 200 pairs->push_back(new_pair); 201} 202 203// Each pass over the PairVector strips one byte off the right-hand-side of the 204// characters and adds a range to the set on the right. For example, the first 205// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", 206// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0", 207// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", 208// [\xa0-\xbf][\x80-\xbf]). 209void MoveRightMostCharToSet(PairVector* pairs) { 210 PairVector new_pairs; 211 PairVector::const_iterator it = pairs->begin(); 212 while (it != pairs->end() && it->character.empty()) { 213 new_pairs.push_back(*it); 214 ++it; 215 } 216 CHECK(it != pairs->end()); 217 Character unconverted_bytes(it->character.begin(), it->character.end() - 1); 218 Range new_range(it->character.back()); 219 StringSet converted = it->set; 220 ++it; 221 while (it != pairs->end()) { 222 const Pair& current_pair = *it++; 223 if (current_pair.character.size() == unconverted_bytes.size() + 1 && 224 std::equal(unconverted_bytes.begin(), 225 unconverted_bytes.end(), 226 current_pair.character.begin()) && 227 converted == current_pair.set) { 228 // The particular set of UTF-8 codepoints we are validating guarantees 229 // that each byte range will be contiguous. This would not necessarily be 230 // true for an arbitrary set of UTF-8 codepoints. 231 DCHECK_EQ(new_range.to() + 1, current_pair.character.back()); 232 new_range.AddByte(current_pair.character.back()); 233 continue; 234 } 235 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); 236 unconverted_bytes = Character(current_pair.character.begin(), 237 current_pair.character.end() - 1); 238 new_range = Range(current_pair.character.back()); 239 converted = current_pair.set; 240 } 241 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); 242 new_pairs.swap(*pairs); 243} 244 245void MoveAllCharsToSets(PairVector* pairs) { 246 // Since each pass of the function moves one character, and UTF-8 sequences 247 // are at most 4 characters long, this simply runs the algorithm four times. 248 for (int i = 0; i < 4; ++i) { 249 MoveRightMostCharToSet(pairs); 250 } 251#if DCHECK_IS_ON 252 for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); 253 ++it) { 254 DCHECK(it->character.empty()); 255 } 256#endif 257} 258 259// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], 260// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the 261// algorithm is working. Use the command-line option 262// --vmodule=build_utf8_validator_tables=1 to see this output. 263void LogStringSets(const PairVector& pairs) { 264 for (PairVector::const_iterator pair_it = pairs.begin(); 265 pair_it != pairs.end(); 266 ++pair_it) { 267 std::string set_as_string; 268 for (StringSet::const_iterator set_it = pair_it->set.begin(); 269 set_it != pair_it->set.end(); 270 ++set_it) { 271 set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]", 272 static_cast<int>(set_it->from()), 273 static_cast<int>(set_it->to())); 274 } 275 VLOG(1) << set_as_string; 276 } 277} 278 279// A single state in the state machine is represented by a sorted vector of 280// start bytes and target states. All input bytes in the range between the start 281// byte and the next entry in the vector (or 0xFF) result in a transition to the 282// target state. 283struct StateRange { 284 uint8 from; 285 uint8 target_state; 286}; 287 288typedef std::vector<StateRange> State; 289 290// Generates a state where all bytes go to state 1 (invalid). This is also used 291// as an initialiser for other states (since bytes from outside the desired 292// range are invalid). 293State GenerateInvalidState() { 294 const StateRange range = {0, 1}; 295 return State(1, range); 296} 297 298// A map from a state (ie. a set of strings which will match from this state) to 299// a number (which is an index into the array of states). 300typedef std::map<StringSet, uint8> StateMap; 301 302// Create a new state corresponding to |set|, add it |states| and |state_map| 303// and return the index it was given in |states|. 304uint8 MakeState(const StringSet& set, 305 std::vector<State>* states, 306 StateMap* state_map) { 307 DCHECK(!set.empty()); 308 const Range& range = set.front(); 309 const StringSet rest(set.begin() + 1, set.end()); 310 const StateMap::const_iterator where = state_map->find(rest); 311 const uint8 target_state = where == state_map->end() 312 ? MakeState(rest, states, state_map) 313 : where->second; 314 DCHECK_LT(0, range.from()); 315 DCHECK_LT(range.to(), 0xFF); 316 const StateRange new_state_initializer[] = { 317 {0, 1}, {range.from(), target_state}, 318 {static_cast<uint8>(range.to() + 1), 1}}; 319 states->push_back( 320 State(new_state_initializer, 321 new_state_initializer + arraysize(new_state_initializer))); 322 const uint8 new_state_number = 323 base::checked_cast<uint8>(states->size() - 1); 324 CHECK(state_map->insert(std::make_pair(set, new_state_number)).second); 325 return new_state_number; 326} 327 328std::vector<State> GenerateStates(const PairVector& pairs) { 329 // States 0 and 1 are the initial/valid state and invalid state, respectively. 330 std::vector<State> states(2, GenerateInvalidState()); 331 StateMap state_map; 332 state_map.insert(std::make_pair(StringSet(), 0)); 333 for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) { 334 DCHECK(it->character.empty()); 335 DCHECK(!it->set.empty()); 336 const Range& range = it->set.front(); 337 const StringSet rest(it->set.begin() + 1, it->set.end()); 338 const StateMap::const_iterator where = state_map.find(rest); 339 const uint8 target_state = where == state_map.end() 340 ? MakeState(rest, &states, &state_map) 341 : where->second; 342 if (states[0].back().from == range.from()) { 343 DCHECK_EQ(1, states[0].back().target_state); 344 states[0].back().target_state = target_state; 345 DCHECK_LT(range.to(), 0xFF); 346 const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1}; 347 states[0].push_back(new_range); 348 } else { 349 DCHECK_LT(range.to(), 0xFF); 350 const StateRange new_range_initializer[] = {{range.from(), target_state}, 351 {static_cast<uint8>(range.to() + 1), 1}}; 352 states[0] 353 .insert(states[0].end(), 354 new_range_initializer, 355 new_range_initializer + arraysize(new_range_initializer)); 356 } 357 } 358 return states; 359} 360 361// Output the generated states as a C++ table. Two tricks are used to compact 362// the table: each state in the table starts with a shift value which indicates 363// how many bits we can discard from the right-hand-side of the byte before 364// doing the table lookup. Secondly, only the state-transitions for bytes 365// with the top-bit set are included in the table; bytes without the top-bit set 366// are just ASCII and are handled directly by the code. 367void PrintStates(const std::vector<State>& states, FILE* stream) { 368 // First calculate the start-offset of each state. This allows the state 369 // machine to jump directly to the correct offset, avoiding an extra 370 // indirection. State 0 starts at offset 0. 371 std::vector<uint8> state_offset(1, 0); 372 std::vector<uint8> shifts; 373 uint8 pos = 0; 374 375 for (std::vector<State>::const_iterator state_it = states.begin(); 376 state_it != states.end(); 377 ++state_it) { 378 // We want to set |shift| to the (0-based) index of the least-significant 379 // set bit in any of the ranges for this state, since this tells us how many 380 // bits we can discard and still determine what range a byte lies in. Sadly 381 // it appears that ffs() is not portable, so we do it clumsily. 382 uint8 shift = 7; 383 for (State::const_iterator range_it = state_it->begin(); 384 range_it != state_it->end(); 385 ++range_it) { 386 while (shift > 0 && range_it->from % (1 << shift) != 0) { 387 --shift; 388 } 389 } 390 shifts.push_back(shift); 391 pos += 1 + (1 << (7 - shift)); 392 state_offset.push_back(pos); 393 } 394 395 DCHECK_EQ(129, state_offset[1]); 396 397 fputs(kProlog, stream); 398 TablePrinter table_printer(stream); 399 400 for (uint8 state_index = 0; state_index < states.size(); ++state_index) { 401 const uint8 shift = shifts[state_index]; 402 uint8 next_range = 0; 403 uint8 target_state = 1; 404 fprintf(stream, 405 " // State %d, offset 0x%02x\n", 406 static_cast<int>(state_index), 407 static_cast<int>(state_offset[state_index])); 408 table_printer.PrintValue(shift); 409 for (int i = 0; i < 0x100; i += (1 << shift)) { 410 if (next_range < states[state_index].size() && 411 states[state_index][next_range].from == i) { 412 target_state = states[state_index][next_range].target_state; 413 ++next_range; 414 } 415 if (i >= 0x80) { 416 table_printer.PrintValue(state_offset[target_state]); 417 } 418 } 419 table_printer.NewLine(); 420 } 421 422 fputs(kEpilog, stream); 423} 424 425} // namespace 426 427int main(int argc, char* argv[]) { 428 CommandLine::Init(argc, argv); 429 logging::LoggingSettings settings; 430 settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; 431 logging::InitLogging(settings); 432 if (CommandLine::ForCurrentProcess()->HasSwitch("help")) { 433 fwrite(kHelpText, 1, arraysize(kHelpText), stdout); 434 exit(EXIT_SUCCESS); 435 } 436 base::FilePath filename = 437 CommandLine::ForCurrentProcess()->GetSwitchValuePath("output"); 438 439 FILE* output = stdout; 440 if (!filename.empty()) { 441 output = base::OpenFile(filename, "wb"); 442 if (!output) 443 PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe() 444 << "' for writing"; 445 } 446 447 // Step 1: Enumerate the characters 448 PairVector pairs = InitializeCharacters(); 449 // Step 2: Convert to sets. 450 MoveAllCharsToSets(&pairs); 451 if (VLOG_IS_ON(1)) { 452 LogStringSets(pairs); 453 } 454 // Step 3: Generate states. 455 std::vector<State> states = GenerateStates(pairs); 456 // Step 4/5: Print output 457 PrintStates(states, output); 458 459 if (!filename.empty()) { 460 if (!base::CloseFile(output)) 461 PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() 462 << "'"; 463 } 464 465 return EXIT_SUCCESS; 466} 467