// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// A streaming validator for UTF-8. Validation is based on the definition in
// RFC-3629. In particular, it does not reject the invalid characters rejected
// by base::IsStringUTF8().
//
// The implementation detects errors on the first possible byte.

115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#ifndef BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <string>
155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/basictypes.h"
175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/i18n/base_i18n_export.h"
185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace base {
205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)class BASE_I18N_EXPORT StreamingUtf8Validator {
225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) public:
235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // The validator exposes 3 states. It starts in state VALID_ENDPOINT. As it
245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // processes characters it alternates between VALID_ENDPOINT and
255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // VALID_MIDPOINT. If it encounters an invalid byte or UTF-8 sequence the
265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // state changes permanently to INVALID.
275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  enum State {
285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    VALID_ENDPOINT,
295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    VALID_MIDPOINT,
305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    INVALID
315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  };
325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  StreamingUtf8Validator() : state_(0u) {}
345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Trivial destructor intentionally omitted.
355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Validate |size| bytes starting at |data|. If the concatenation of all calls
375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // to AddBytes() since this object was constructed or reset is a valid UTF-8
385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // string, returns VALID_ENDPOINT. If it could be the prefix of a valid UTF-8
395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // string, returns VALID_MIDPOINT. If an invalid byte or UTF-8 sequence was
405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // present, returns INVALID.
415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  State AddBytes(const char* data, size_t size);
425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Return the object to a freshly-constructed state so that it can be re-used.
445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  void Reset();
455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Validate a complete string using the same criteria. Returns true if the
475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // string only contains complete, valid UTF-8 codepoints.
485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  static bool Validate(const std::string& string);
495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) private:
515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // The current state of the validator. Value 0 is the initial/valid state.
525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // The state is stored as an offset into |kUtf8ValidatorTables|. The special
535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // state |kUtf8InvalidState| is invalid.
545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  uint8 state_;
555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // This type could be made copyable but there is currently no use-case for
575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // it.
585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(StreamingUtf8Validator);
595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)};
605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}  // namespace base
625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#endif  // BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
64