15d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Copyright 2014 The Chromium Authors. All rights reserved. 25d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// found in the LICENSE file. 45d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// A streaming validator for UTF-8. Validation is based on the definition in 65d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// RFC-3629. In particular, it does not reject the invalid characters rejected 75d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// by base::IsStringUTF8(). 85d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// 95d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// The implementation detects errors on the first possible byte. 105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#ifndef BASE_I18N_STREAMING_UTF8_VALIDATOR_H_ 125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define BASE_I18N_STREAMING_UTF8_VALIDATOR_H_ 135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <string> 155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/basictypes.h" 175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/i18n/base_i18n_export.h" 185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace base { 205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)class BASE_I18N_EXPORT StreamingUtf8Validator { 225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) public: 235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // The validator exposes 3 states. It starts in state VALID_ENDPOINT. As it 245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // processes characters it alternates between VALID_ENDPOINT and 255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // VALID_MIDPOINT. If it encounters an invalid byte or UTF-8 sequence the 265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // state changes permanently to INVALID. 275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) enum State { 285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) VALID_ENDPOINT, 295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) VALID_MIDPOINT, 305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) INVALID 315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) }; 325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) StreamingUtf8Validator() : state_(0u) {} 345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Trivial destructor intentionally omitted. 355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Validate |size| bytes starting at |data|. If the concatenation of all calls 375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // to AddBytes() since this object was constructed or reset is a valid UTF-8 385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // string, returns VALID_ENDPOINT. If it could be the prefix of a valid UTF-8 395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // string, returns VALID_MIDPOINT. If an invalid byte or UTF-8 sequence was 405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // present, returns INVALID. 415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) State AddBytes(const char* data, size_t size); 425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Return the object to a freshly-constructed state so that it can be re-used. 445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) void Reset(); 455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Validate a complete string using the same criteria. Returns true if the 475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // string only contains complete, valid UTF-8 codepoints. 485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) static bool Validate(const std::string& string); 495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) private: 515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // The current state of the validator. Value 0 is the initial/valid state. 525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // The state is stored as an offset into |kUtf8ValidatorTables|. The special 535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // state |kUtf8InvalidState| is invalid. 545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 state_; 555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // This type could be made copyable but there is currently no use-case for 575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // it. 585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(StreamingUtf8Validator); 595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}; 605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} // namespace base 625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#endif // BASE_I18N_STREAMING_UTF8_VALIDATOR_H_ 64