streaming_utf8_validator.h revision 5d1f7b1de12d16ceb2c938c56701a3e8bfa558f7
1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// A streaming validator for UTF-8. Validation is based on the definition in
6// RFC-3629. In particular, characters that base::IsStringUTF8() treats as
7// invalid are not rejected here as long as they are well-formed per the RFC.
8//
9// The implementation detects errors on the first possible byte.
10
11#ifndef BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
12#define BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
13
14#include <string>
15
16#include "base/basictypes.h"
17#include "base/i18n/base_i18n_export.h"
18
19namespace base {
20
// Incremental (streaming) UTF-8 validator. Bytes are supplied in arbitrary
// chunks through AddBytes(); the verdict always refers to the concatenation of
// every chunk seen since construction or the last Reset(), so a multi-byte
// sequence may be split across calls. The position within the stream is
// carried between calls in |state_|. Use the static Validate() helper when the
// whole string is available at once.
21class BASE_I18N_EXPORT StreamingUtf8Validator {
22 public:
23  // The validator exposes 3 states. It starts in state VALID_ENDPOINT. As it
24  // processes characters it alternates between VALID_ENDPOINT and
25  // VALID_MIDPOINT. If it encounters an invalid byte or UTF-8 sequence the
26  // state changes permanently to INVALID.
  // The per-value meanings below restate the AddBytes() return contract.
27  enum State {
28    VALID_ENDPOINT,  // Input so far is a complete, valid UTF-8 string.
29    VALID_MIDPOINT,  // Input so far could be the prefix of a valid string.
30    INVALID  // An invalid byte or UTF-8 sequence has been seen.
31  };
32
  // Constructs a validator in its initial state: |state_| is set to 0, the
  // value documented on |state_| as the initial/valid state.
33  StreamingUtf8Validator() : state_(0u) {}
34  // Trivial destructor intentionally omitted.
35
36  // Validate |size| bytes starting at |data|. If the concatenation of all calls
37  // to AddBytes() since this object was constructed or reset is a valid UTF-8
38  // string, returns VALID_ENDPOINT. If it could be the prefix of a valid UTF-8
39  // string, returns VALID_MIDPOINT. If an invalid byte or UTF-8 sequence was
40  // present, returns INVALID.
  //
  // The length is passed explicitly in |size|; |data| is a plain byte pointer.
  // NOTE(review): behaviour for |size| == 0 is not visible from this header;
  // presumably the current state is returned unchanged -- confirm in the .cc.
41  State AddBytes(const char* data, size_t size);
42
43  // Return the object to a freshly-constructed state so that it can be re-used.
  // Input supplied before the call no longer counts towards the result of
  // later AddBytes() calls (see "constructed or reset" above).
44  void Reset();
45
46  // Validate a complete string using the same criteria. Returns true if the
47  // string only contains complete, valid UTF-8 codepoints.
  // Static: no validator instance is required. "Complete" means a string that
  // stops part-way through a multi-byte sequence is not accepted.
  // NOTE(review): an empty string presumably yields true, matching the initial
  // VALID_ENDPOINT state -- confirm against the implementation.
48  static bool Validate(const std::string& string);
49
50 private:
51  // The current state of the validator. Value 0 is the initial/valid state.
52  // The state is stored as an offset into |kUtf8ValidatorTables|. The special
53  // state |kUtf8InvalidState| is invalid.
  // Neither |kUtf8ValidatorTables| nor |kUtf8InvalidState| is declared in this
  // header. Because the member is a uint8, every offset used as a state has to
  // fit in 8 bits.
54  uint8 state_;
55
56  // This type could be made copyable but there is currently no use-case for
57  // it.
  // NOTE(review): the macro is presumably supplied via "base/basictypes.h" --
  // confirm.
58  DISALLOW_COPY_AND_ASSIGN(StreamingUtf8Validator);
59};
60
61}  // namespace base
62
63#endif  // BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
64