1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/i18n/streaming_utf8_validator.h"
6
7#include <stdio.h>
8#include <string.h>
9
10#include <string>
11
12#include "base/strings/string_piece.h"
13#include "testing/gtest/include/gtest/gtest.h"
14
15// Define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST to verify that this class
16// accepts exactly the same set of 4-byte strings as ICU-based validation. This
17// tests every possible 4-byte string, so it is too slow to run routinely on
18// low-powered machines.
19//
20// #define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
21
22#ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
23
24#include "base/basictypes.h"
25#include "base/bind.h"
26#include "base/location.h"
27#include "base/logging.h"
28#include "base/memory/ref_counted.h"
29#include "base/strings/string_util.h"
30#include "base/strings/stringprintf.h"
31#include "base/strings/utf_string_conversion_utils.h"
32#include "base/synchronization/condition_variable.h"
33#include "base/synchronization/lock.h"
34#include "base/threading/sequenced_worker_pool.h"
35#include "third_party/icu/source/common/unicode/utf8.h"
36
37#endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
38
39namespace base {
40namespace {
41
42// Avoid having to qualify the enum values in the tests.
43const StreamingUtf8Validator::State VALID_ENDPOINT =
44    StreamingUtf8Validator::VALID_ENDPOINT;
45const StreamingUtf8Validator::State VALID_MIDPOINT =
46    StreamingUtf8Validator::VALID_MIDPOINT;
47const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
48
49#ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
50
51const uint32 kThoroughTestChunkSize = 1 << 24;
52
53class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
54 protected:
55  StreamingUtf8ValidatorThoroughTest()
56      : all_done_(&lock_), tasks_dispatched_(0), tasks_finished_(0) {}
57
58  // This uses the same logic as base::IsStringUTF8 except it considers
59  // non-characters valid (and doesn't require a string as input).
60  static bool IsStringUtf8(const char* src, int32 src_len) {
61    int32 char_index = 0;
62
63    while (char_index < src_len) {
64      int32 code_point;
65      U8_NEXT(src, char_index, src_len, code_point);
66      if (!base::IsValidCodepoint(code_point))
67        return false;
68    }
69    return true;
70  }
71
72  // Converts the passed-in integer to a 4 byte string and then
73  // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
74  // whether it is valid UTF-8 or not.
75  void TestNumber(uint32 n) const {
76    char test[sizeof n];
77    memcpy(test, &n, sizeof n);
78    StreamingUtf8Validator validator;
79    EXPECT_EQ(IsStringUtf8(test, sizeof n),
80              validator.AddBytes(test, sizeof n) == VALID_ENDPOINT)
81        << "Difference of opinion for \""
82        << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X",
83                              test[0] & 0xFF,
84                              test[1] & 0xFF,
85                              test[2] & 0xFF,
86                              test[3] & 0xFF) << "\"";
87  }
88
89 public:
90  // Tests the 4-byte sequences corresponding to the |size| integers
91  // starting at |begin|. This is intended to be run from a worker
92  // pool. Signals |all_done_| at the end if it thinks all tasks are
93  // finished.
94  void TestRange(uint32 begin, uint32 size) {
95    for (uint32 i = 0; i < size; ++i) {
96      TestNumber(begin + i);
97    }
98    base::AutoLock al(lock_);
99    ++tasks_finished_;
100    LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
101              << " tasks done\n";
102    if (tasks_finished_ >= tasks_dispatched_) {
103      all_done_.Signal();
104    }
105  }
106
107 protected:
108  base::Lock lock_;
109  base::ConditionVariable all_done_;
110  int tasks_dispatched_;
111  int tasks_finished_;
112};
113
114TEST_F(StreamingUtf8ValidatorThoroughTest, TestEverything) {
115  scoped_refptr<base::SequencedWorkerPool> pool =
116      new base::SequencedWorkerPool(32, "TestEverything");
117  base::AutoLock al(lock_);
118  uint32 begin = 0;
119  do {
120    pool->PostWorkerTask(
121        FROM_HERE,
122        base::Bind(&StreamingUtf8ValidatorThoroughTest::TestRange,
123                   base::Unretained(this),
124                   begin,
125                   kThoroughTestChunkSize));
126    ++tasks_dispatched_;
127    begin += kThoroughTestChunkSize;
128  } while (begin != 0);
129  while (tasks_finished_ < tasks_dispatched_)
130    all_done_.Wait();
131}
132
133#endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
134
135// These valid and invalid UTF-8 sequences are based on the tests from
136// base/strings/string_util_unittest.cc
137
// Each entry of |valid| has to encode exactly one code point: partial
// sequences are produced by taking non-empty prefixes of these strings.
const char* const valid[] = {
    "\r",
    "\n",
    "a",
    "\xc2\x81",
    "\xe1\x80\xbf",
    "\xf1\x80\xa0\xbf",
    "\xef\xbb\xbf",  // UTF-8 BOM
};
145
146const char* const* const valid_end = valid + arraysize(valid);
147
// Byte sequences that are not valid UTF-8. Every literal must be followed by
// a comma: adjacent string literals are silently concatenated by the compiler
// into a single array element, which would leave the individual sequences
// untested.
const char* const invalid[] = {
    // always invalid bytes
    "\xc0", "\xc1",
    "\xf5", "\xf6", "\xf7",
    "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
    //
    // surrogate code points
    "\xed\xa0\x80",  // U+D800
    "\xed\xa0\x8f",  // U+D80F
    "\xed\xbf\xbf",  // U+DFFF
    //
    // overlong sequences
    "\xc0\x80",              // U+0000
    "\xc1\x80",              // "A"
    "\xc1\x81",              // "B"
    "\xe0\x80\x80",          // U+0000
    "\xe0\x82\x80",          // U+0080
    "\xe0\x9f\xbf",          // U+07ff
    "\xf0\x80\x80\x8D",      // U+000D
    "\xf0\x80\x82\x91",      // U+0091
    "\xf0\x80\xa0\x80",      // U+0800
    "\xf0\x8f\xbb\xbf",      // U+FEFF (BOM)
    "\xf8\x80\x80\x80\xbf",  // U+003F
    "\xfc\x80\x80\x80\xa0\xa5",
    //
    // Beyond U+10FFFF
    "\xf4\x90\x80\x80",          // U+110000
    "\xf8\xa0\xbf\x80\xbf",      // 5 bytes
    "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes
    //
    // BOMs in UTF-16(BE|LE)
    "\xfe\xff", "\xff\xfe",
};
178
179const char* const* const invalid_end = invalid + arraysize(invalid);
180
181// A ForwardIterator which returns all the non-empty prefixes of the elements of
182// "valid".
183class PartialIterator {
184 public:
185  // The constructor returns the first iterator, ie. it is equivalent to
186  // begin().
187  PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
188  // The trivial destructor left intentionally undefined.
189  // This is a value type; the default copy constructor and assignment operator
190  // generated by the compiler are used.
191
192  static PartialIterator end() { return PartialIterator(arraysize(valid), 1); }
193
194  PartialIterator& operator++() {
195    Advance();
196    return *this;
197  }
198
199  base::StringPiece operator*() const {
200    return base::StringPiece(valid[index_], prefix_length_);
201  }
202
203  bool operator==(const PartialIterator& rhs) const {
204    return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
205  }
206
207  bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
208
209 private:
210  // This constructor is used by the end() method.
211  PartialIterator(size_t index, size_t prefix_length)
212      : index_(index), prefix_length_(prefix_length) {}
213
214  void Advance() {
215    if (index_ < arraysize(valid) && prefix_length_ < strlen(valid[index_]))
216      ++prefix_length_;
217    while (index_ < arraysize(valid) &&
218           prefix_length_ == strlen(valid[index_])) {
219      ++index_;
220      prefix_length_ = 1;
221    }
222  }
223
224  // The UTF-8 sequence, as an offset into the |valid| array.
225  size_t index_;
226  size_t prefix_length_;
227};
228
229// A test fixture for tests which test one UTF-8 sequence (or invalid
230// byte sequence) at a time.
231class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
232 protected:
233  // Iterator must be convertible when de-referenced to StringPiece.
234  template <typename Iterator>
235  void CheckRange(Iterator begin,
236                  Iterator end,
237                  StreamingUtf8Validator::State expected) {
238    for (Iterator it = begin; it != end; ++it) {
239      StreamingUtf8Validator validator;
240      base::StringPiece sequence = *it;
241      EXPECT_EQ(expected,
242                validator.AddBytes(sequence.data(), sequence.size()))
243          << "Failed for \"" << sequence << "\"";
244    }
245  }
246
247  // Adding input a byte at a time should make absolutely no difference.
248  template <typename Iterator>
249  void CheckRangeByteAtATime(Iterator begin,
250                             Iterator end,
251                             StreamingUtf8Validator::State expected) {
252    for (Iterator it = begin; it != end; ++it) {
253      StreamingUtf8Validator validator;
254      base::StringPiece sequence = *it;
255      StreamingUtf8Validator::State state = VALID_ENDPOINT;
256      for (base::StringPiece::const_iterator cit = sequence.begin();
257           cit != sequence.end();
258           ++cit) {
259        state = validator.AddBytes(&*cit, 1);
260      }
261      EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
262    }
263  }
264};
265
266// A test fixture for tests which test the concatenation of byte sequences.
267class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
268 protected:
269  // Check every possible concatenation of byte sequences from two
270  // ranges, and verify that the combination matches the expected
271  // state.
272  template <typename Iterator1, typename Iterator2>
273  void CheckCombinations(Iterator1 begin1,
274                         Iterator1 end1,
275                         Iterator2 begin2,
276                         Iterator2 end2,
277                         StreamingUtf8Validator::State expected) {
278    StreamingUtf8Validator validator;
279    for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
280      base::StringPiece c1 = *it1;
281      for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
282        base::StringPiece c2 = *it2;
283        validator.AddBytes(c1.data(), c1.size());
284        EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size()))
285            << "Failed for \"" << c1 << c2 << "\"";
286        validator.Reset();
287      }
288    }
289  }
290};
291
292TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
293  static const char kNothing[] = "";
294  EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNothing, 0));
295}
296
// Because the members of the |valid| array need to be non-zero length
// sequences and are measured with strlen(), |valid| cannot be used
// to test the NUL character '\0', so the NUL character gets its own
// test.
301TEST(StreamingUtf8ValidatorTest, NulIsValid) {
302  static const char kNul[] = "\x00";
303  EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNul, 1));
304}
305
306// Just a basic sanity test before we start getting fancy.
307TEST(StreamingUtf8ValidatorTest, HelloWorld) {
308  static const char kHelloWorld[] = "Hello, World!";
309  EXPECT_EQ(
310      VALID_ENDPOINT,
311      StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld)));
312}
313
314// Check that the Reset() method works.
315TEST(StreamingUtf8ValidatorTest, ResetWorks) {
316  StreamingUtf8Validator validator;
317  EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1));
318  EXPECT_EQ(INVALID, validator.AddBytes("a", 1));
319  validator.Reset();
320  EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1));
321}
322
323TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
324  CheckRange(valid, valid_end, VALID_ENDPOINT);
325}
326
327TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
328  CheckRange(PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
329}
330
331TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
332  CheckRange(invalid, invalid_end, INVALID);
333}
334
335TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
336  CheckRangeByteAtATime(valid, valid_end, VALID_ENDPOINT);
337}
338
339TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
340  CheckRangeByteAtATime(
341      PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
342}
343
344TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
345  CheckRangeByteAtATime(invalid, invalid_end, INVALID);
346}
347
348TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
349  CheckCombinations(valid, valid_end, valid, valid_end, VALID_ENDPOINT);
350}
351
352TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
353  CheckCombinations(valid,
354                    valid_end,
355                    PartialIterator(),
356                    PartialIterator::end(),
357                    VALID_MIDPOINT);
358}
359
360TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
361  CheckCombinations(
362      PartialIterator(), PartialIterator::end(), valid, valid_end, INVALID);
363}
364
365TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
366  CheckCombinations(PartialIterator(),
367                    PartialIterator::end(),
368                    PartialIterator(),
369                    PartialIterator::end(),
370                    INVALID);
371}
372
373TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
374  CheckCombinations(valid, valid_end, invalid, invalid_end, INVALID);
375}
376
377TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
378  CheckCombinations(invalid, invalid_end, valid, valid_end, INVALID);
379}
380
381TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
382  CheckCombinations(invalid, invalid_end, invalid, invalid_end, INVALID);
383}
384
385TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
386  CheckCombinations(
387      invalid, invalid_end, PartialIterator(), PartialIterator::end(), INVALID);
388}
389
390TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
391  CheckCombinations(
392      PartialIterator(), PartialIterator::end(), invalid, invalid_end, INVALID);
393}
394
395TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
396  EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string()));
397}
398
399TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
400  EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81"));
401}
402
403TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
404  EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80"));
405}
406
407TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
408  EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2"));
409}
410
411}  // namespace
412}  // namespace base
413