1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// All data that is passed through a WebSocket with type "Text" needs to be
6// validated as UTF8. Since this is done on the IO thread, it needs to be
7// reasonably fast.
8
9// We are only interested in the performance on valid UTF8. Invalid UTF8 will
10// result in a connection failure, so is unlikely to become a source of
11// performance issues.
12
13#include "base/i18n/streaming_utf8_validator.h"
14
15#include <string>
16
17#include "base/basictypes.h"
18#include "base/bind.h"
19#include "base/callback.h"
20#include "base/strings/string_util.h"
21#include "base/strings/stringprintf.h"
22#include "base/test/perf_time_logger.h"
23#include "testing/gtest/include/gtest/gtest.h"
24
25namespace base {
26namespace {
27
// We want to test ranges of valid UTF-8 sequences. These ranges are inclusive.
// They are intended to be large enough that the validator needs to do
// meaningful work while being in some sense "realistic" (eg. control characters
// are not included).
//
// There is one range per encoded length, from 1 to 4 bytes. Each constant is a
// NUL-terminated string holding exactly one UTF-8 sequence; the trailing
// comment gives the code point it encodes. The tests hand each Start/End pair
// to ConstructRangedTestString(), and each Start value on its own to
// ConstructRepeatedTestString().

// 1-byte sequences.
const char kOneByteSeqRangeStart[] = " ";  // U+0020
const char kOneByteSeqRangeEnd[] = "~";    // U+007E

// 2-byte sequences.
const char kTwoByteSeqRangeStart[] = "\xc2\xa0";  // U+00A0 non-breaking space
const char kTwoByteSeqRangeEnd[] = "\xc9\x8f";    // U+024F small y with stroke

// 3-byte sequences.
const char kThreeByteSeqRangeStart[] = "\xe3\x81\x82";  // U+3042 Hiragana "a"
const char kThreeByteSeqRangeEnd[] = "\xe9\xbf\x83";    // U+9FC3 "to blink"

// 4-byte sequences.
const char kFourByteSeqRangeStart[] = "\xf0\xa0\x80\x8b";  // U+2000B
const char kFourByteSeqRangeEnd[] = "\xf0\xaa\x9a\xb2";    // U+2A6B2
43
// The different lengths of strings to test, in bytes. RunSomeTests() requests
// a test string for each of these lengths in turn; the string that is actually
// constructed may be longer than the requested length.
const size_t kTestLengths[] = {1, 32, 256, 32768, 1 << 20};
46
// Baseline for comparison: the simplest possible byte-at-a-time check, which
// just reports whether every byte of |s| has its top bit clear. Returns true
// for an empty string.
//
// This is only tried on 1-byte UTF-8 sequences, because it rejects any string
// containing a top-bit-set byte and so the results would not be meaningful for
// longer sequences.
bool IsString7Bit(const std::string& s) {
  const size_t size = s.size();
  for (size_t pos = 0; pos < size; ++pos) {
    const bool top_bit_set = (s[pos] & 0x80) != 0;
    if (top_bit_set)
      return false;
  }
  return true;
}
58
59// Assumes that |previous| is a valid UTF-8 sequence, and attempts to return
60// the next one. Is just barely smart enough to iterate through the ranges
61// defined about.
62std::string NextUtf8Sequence(const std::string& previous) {
63  DCHECK(StreamingUtf8Validator::Validate(previous));
64  std::string next = previous;
65  for (int i = static_cast<int>(previous.length() - 1); i >= 0; --i) {
66    // All bytes in a UTF-8 sequence except the first one are
67    // constrained to the range 0x80 to 0xbf, inclusive. When we
68    // increment past 0xbf, we carry into the previous byte.
69    if (i > 0 && next[i] == '\xbf') {
70      next[i] = '\x80';
71      continue;  // carry
72    }
73    ++next[i];
74    break;  // no carry
75  }
76  DCHECK(StreamingUtf8Validator::Validate(next))
77      << "Result \"" << next << "\" failed validation";
78  return next;
79}
80
// Pointer to a function under test: it takes the string to check and returns
// a bool. RunSomeTests() expects true for every string it supplies.
typedef bool (*TestTargetType)(const std::string&);
82
83// Run fuction |target| over |test_string| |times| times, and report the results
84// using |description|.
85bool RunTest(const std::string& description,
86             TestTargetType target,
87             const std::string& test_string,
88             int times) {
89  base::PerfTimeLogger timer(description.c_str());
90  bool result = true;
91  for (int i = 0; i < times; ++i) {
92    result = target(test_string) && result;
93  }
94  timer.Done();
95  return result;
96}
97
// Constructs a string by repeating |input| enough times to equal or exceed
// |length| bytes.
//
// The result always consists of whole copies of |input|: the smallest number
// of copies that reaches |length|, and never fewer than one (so a |length| of
// zero still returns a single copy). It therefore never ends with a truncated
// copy of |input|.
//
// An empty |input| can never reach a non-zero |length|, so it yields an empty
// string instead of looping forever.
std::string ConstructRepeatedTestString(const std::string& input,
                                        size_t length) {
  const size_t unit = input.length();
  if (unit == 0)
    return std::string();

  // Number of whole copies of |input| needed to cover |length| bytes.
  const size_t copies = length <= unit ? 1 : (length - 1) / unit + 1;
  const size_t target_length = copies * unit;

  std::string output = input;
  output.reserve(target_length);
  // Grow by doubling, but never beyond |target_length|. |output| always holds
  // a whole number of copies of |input| and |missing| is a multiple of |unit|,
  // so the prefix appended here is itself a run of whole copies.
  while (output.length() < target_length) {
    const size_t missing = target_length - output.length();
    const size_t chunk = missing < output.length() ? missing : output.length();
    output.append(output, 0, chunk);
  }
  return output;
}
111
112// Construct a string by expanding the range of UTF-8 sequences
113// between |input_start| and |input_end|, inclusive, and then
114// repeating the resulting string until it equals or exceeds |length|
115// bytes. |input_start| and |input_end| must be valid UTF-8
116// sequences.
117std::string ConstructRangedTestString(const std::string& input_start,
118                                      const std::string& input_end,
119                                      size_t length) {
120  std::string output = input_start;
121  std::string input = input_start;
122  while (output.length() < length && input != input_end) {
123    input = NextUtf8Sequence(input);
124    output += input;
125  }
126  if (output.length() < length) {
127    output = ConstructRepeatedTestString(output, length);
128  }
129  return output;
130}
131
// Pairs a function under test with the name used to identify it in the
// reported results.
struct TestFunctionDescription {
  // The function that RunSomeTests() hands to RunTest() to be timed.
  TestTargetType function;
  // Name passed as the first argument after the format string when
  // RunSomeTests() builds the description of a measurement.
  const char* function_name;
};
136
// The functions to be compared. IsString7Bit is intentionally placed last so
// it can be excluded easily: the 1-byte tests pass a count of 3 to
// RunSomeTests() to run every entry, while the tests for longer sequences pass
// a count of 2, which leaves IsString7Bit out. Keep those counts in step with
// this array when adding or removing entries.
const TestFunctionDescription kTestFunctions[] = {
    {&StreamingUtf8Validator::Validate, "StreamingUtf8Validator"},
    {&IsStringUTF8, "IsStringUTF8"}, {&IsString7Bit, "IsString7Bit"}};
141
142// Construct a test string from |construct_test_string| for each of the lengths
143// in |kTestLengths| in turn. For each string, run each test in |test_functions|
144// for a number of iterations such that the total number of bytes validated
145// is around 16MB.
146void RunSomeTests(
147    const char format[],
148    base::Callback<std::string(size_t length)> construct_test_string,
149    const TestFunctionDescription* test_functions,
150    size_t test_count) {
151  for (size_t i = 0; i < arraysize(kTestLengths); ++i) {
152    const size_t length = kTestLengths[i];
153    const std::string test_string = construct_test_string.Run(length);
154    const int real_length = static_cast<int>(test_string.length());
155    const int times = (1 << 24) / real_length;
156    for (size_t test_index = 0; test_index < test_count; ++test_index) {
157      EXPECT_TRUE(RunTest(StringPrintf(format,
158                                       test_functions[test_index].function_name,
159                                       real_length,
160                                       times),
161                          test_functions[test_index].function,
162                          test_string,
163                          times));
164    }
165  }
166}
167
168TEST(StreamingUtf8ValidatorPerfTest, OneByteRepeated) {
169  RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d",
170               base::Bind(ConstructRepeatedTestString, kOneByteSeqRangeStart),
171               kTestFunctions,
172               3);
173}
174
175TEST(StreamingUtf8ValidatorPerfTest, OneByteRange) {
176  RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d",
177               base::Bind(ConstructRangedTestString,
178                          kOneByteSeqRangeStart,
179                          kOneByteSeqRangeEnd),
180               kTestFunctions,
181               3);
182}
183
184TEST(StreamingUtf8ValidatorPerfTest, TwoByteRepeated) {
185  RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d",
186               base::Bind(ConstructRepeatedTestString, kTwoByteSeqRangeStart),
187               kTestFunctions,
188               2);
189}
190
191TEST(StreamingUtf8ValidatorPerfTest, TwoByteRange) {
192  RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d",
193               base::Bind(ConstructRangedTestString,
194                          kTwoByteSeqRangeStart,
195                          kTwoByteSeqRangeEnd),
196               kTestFunctions,
197               2);
198}
199
200TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRepeated) {
201  RunSomeTests(
202      "%s: bytes=3 repeated length=%d repeat=%d",
203      base::Bind(ConstructRepeatedTestString, kThreeByteSeqRangeStart),
204      kTestFunctions,
205      2);
206}
207
208TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRange) {
209  RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d",
210               base::Bind(ConstructRangedTestString,
211                          kThreeByteSeqRangeStart,
212                          kThreeByteSeqRangeEnd),
213               kTestFunctions,
214               2);
215}
216
217TEST(StreamingUtf8ValidatorPerfTest, FourByteRepeated) {
218  RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d",
219               base::Bind(ConstructRepeatedTestString, kFourByteSeqRangeStart),
220               kTestFunctions,
221               2);
222}
223
224TEST(StreamingUtf8ValidatorPerfTest, FourByteRange) {
225  RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d",
226               base::Bind(ConstructRangedTestString,
227                          kFourByteSeqRangeStart,
228                          kFourByteSeqRangeEnd),
229               kTestFunctions,
230               2);
231}
232
233}  // namespace
234}  // namespace base
235