// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// All data that is passed through a WebSocket with type "Text" needs to be
// validated as UTF8. Since this is done on the IO thread, it needs to be
// reasonably fast.

// We are only interested in the performance on valid UTF8. Invalid UTF8 will
// result in a connection failure, so is unlikely to become a source of
// performance issues.

#include "base/i18n/streaming_utf8_validator.h"

#include <string>

#include "base/basictypes.h"
#include "base/bind.h"
#include "base/callback.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/test/perf_time_logger.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace base {
namespace {

// We want to test ranges of valid UTF-8 sequences. These ranges are inclusive.
// They are intended to be large enough that the validator needs to do
// meaningful work while being in some sense "realistic" (e.g. control
// characters are not included).
//
// Each constant holds exactly one encoded character; the trailing comment
// gives the code point it encodes.
const char kOneByteSeqRangeStart[] = " ";  // U+0020
const char kOneByteSeqRangeEnd[] = "~";    // U+007E

const char kTwoByteSeqRangeStart[] = "\xc2\xa0";  // U+00A0 non-breaking space
const char kTwoByteSeqRangeEnd[] = "\xc9\x8f";    // U+024F small y with stroke

const char kThreeByteSeqRangeStart[] = "\xe3\x81\x82";  // U+3042 Hiragana "a"
const char kThreeByteSeqRangeEnd[] = "\xe9\xbf\x83";    // U+9FC3 "to blink"

const char kFourByteSeqRangeStart[] = "\xf0\xa0\x80\x8b";  // U+2000B
const char kFourByteSeqRangeEnd[] = "\xf0\xaa\x9a\xb2";    // U+2A6B2

// The different lengths of strings to test. Each value is the minimum size
// requested from the string-construction helpers; the strings they return may
// be slightly longer.
const size_t kTestLengths[] = {1, 32, 256, 32768, 1 << 20};

// Simplest possible byte-at-a-time validator, to provide a baseline
// for comparison.
This is only tried on 1-byte UTF-8 sequences, as 49// the results will not be meaningful with sequences containing 50// top-bit-set bytes. 51bool IsString7Bit(const std::string& s) { 52 for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) { 53 if (*it & 0x80) 54 return false; 55 } 56 return true; 57} 58 59// Assumes that |previous| is a valid UTF-8 sequence, and attempts to return 60// the next one. Is just barely smart enough to iterate through the ranges 61// defined about. 62std::string NextUtf8Sequence(const std::string& previous) { 63 DCHECK(StreamingUtf8Validator::Validate(previous)); 64 std::string next = previous; 65 for (int i = static_cast<int>(previous.length() - 1); i >= 0; --i) { 66 // All bytes in a UTF-8 sequence except the first one are 67 // constrained to the range 0x80 to 0xbf, inclusive. When we 68 // increment past 0xbf, we carry into the previous byte. 69 if (i > 0 && next[i] == '\xbf') { 70 next[i] = '\x80'; 71 continue; // carry 72 } 73 ++next[i]; 74 break; // no carry 75 } 76 DCHECK(StreamingUtf8Validator::Validate(next)) 77 << "Result \"" << next << "\" failed validation"; 78 return next; 79} 80 81typedef bool (*TestTargetType)(const std::string&); 82 83// Run fuction |target| over |test_string| |times| times, and report the results 84// using |description|. 85bool RunTest(const std::string& description, 86 TestTargetType target, 87 const std::string& test_string, 88 int times) { 89 base::PerfTimeLogger timer(description.c_str()); 90 bool result = true; 91 for (int i = 0; i < times; ++i) { 92 result = target(test_string) && result; 93 } 94 timer.Done(); 95 return result; 96} 97 98// Construct a string by repeating |input| enough times to equal or exceed 99// |length|. 
100std::string ConstructRepeatedTestString(const std::string& input, 101 size_t length) { 102 std::string output = input; 103 while (output.length() * 2 < length) { 104 output += output; 105 } 106 if (output.length() < length) { 107 output += ConstructRepeatedTestString(input, length - output.length()); 108 } 109 return output; 110} 111 112// Construct a string by expanding the range of UTF-8 sequences 113// between |input_start| and |input_end|, inclusive, and then 114// repeating the resulting string until it equals or exceeds |length| 115// bytes. |input_start| and |input_end| must be valid UTF-8 116// sequences. 117std::string ConstructRangedTestString(const std::string& input_start, 118 const std::string& input_end, 119 size_t length) { 120 std::string output = input_start; 121 std::string input = input_start; 122 while (output.length() < length && input != input_end) { 123 input = NextUtf8Sequence(input); 124 output += input; 125 } 126 if (output.length() < length) { 127 output = ConstructRepeatedTestString(output, length); 128 } 129 return output; 130} 131 132struct TestFunctionDescription { 133 TestTargetType function; 134 const char* function_name; 135}; 136 137// IsString7Bit is intentionally placed last so it can be excluded easily. 138const TestFunctionDescription kTestFunctions[] = { 139 {&StreamingUtf8Validator::Validate, "StreamingUtf8Validator"}, 140 {&IsStringUTF8, "IsStringUTF8"}, {&IsString7Bit, "IsString7Bit"}}; 141 142// Construct a test string from |construct_test_string| for each of the lengths 143// in |kTestLengths| in turn. For each string, run each test in |test_functions| 144// for a number of iterations such that the total number of bytes validated 145// is around 16MB. 
146void RunSomeTests( 147 const char format[], 148 base::Callback<std::string(size_t length)> construct_test_string, 149 const TestFunctionDescription* test_functions, 150 size_t test_count) { 151 for (size_t i = 0; i < arraysize(kTestLengths); ++i) { 152 const size_t length = kTestLengths[i]; 153 const std::string test_string = construct_test_string.Run(length); 154 const int real_length = static_cast<int>(test_string.length()); 155 const int times = (1 << 24) / real_length; 156 for (size_t test_index = 0; test_index < test_count; ++test_index) { 157 EXPECT_TRUE(RunTest(StringPrintf(format, 158 test_functions[test_index].function_name, 159 real_length, 160 times), 161 test_functions[test_index].function, 162 test_string, 163 times)); 164 } 165 } 166} 167 168TEST(StreamingUtf8ValidatorPerfTest, OneByteRepeated) { 169 RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d", 170 base::Bind(ConstructRepeatedTestString, kOneByteSeqRangeStart), 171 kTestFunctions, 172 3); 173} 174 175TEST(StreamingUtf8ValidatorPerfTest, OneByteRange) { 176 RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d", 177 base::Bind(ConstructRangedTestString, 178 kOneByteSeqRangeStart, 179 kOneByteSeqRangeEnd), 180 kTestFunctions, 181 3); 182} 183 184TEST(StreamingUtf8ValidatorPerfTest, TwoByteRepeated) { 185 RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d", 186 base::Bind(ConstructRepeatedTestString, kTwoByteSeqRangeStart), 187 kTestFunctions, 188 2); 189} 190 191TEST(StreamingUtf8ValidatorPerfTest, TwoByteRange) { 192 RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d", 193 base::Bind(ConstructRangedTestString, 194 kTwoByteSeqRangeStart, 195 kTwoByteSeqRangeEnd), 196 kTestFunctions, 197 2); 198} 199 200TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRepeated) { 201 RunSomeTests( 202 "%s: bytes=3 repeated length=%d repeat=%d", 203 base::Bind(ConstructRepeatedTestString, kThreeByteSeqRangeStart), 204 kTestFunctions, 205 2); 206} 207 208TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRange) 
{ 209 RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d", 210 base::Bind(ConstructRangedTestString, 211 kThreeByteSeqRangeStart, 212 kThreeByteSeqRangeEnd), 213 kTestFunctions, 214 2); 215} 216 217TEST(StreamingUtf8ValidatorPerfTest, FourByteRepeated) { 218 RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d", 219 base::Bind(ConstructRepeatedTestString, kFourByteSeqRangeStart), 220 kTestFunctions, 221 2); 222} 223 224TEST(StreamingUtf8ValidatorPerfTest, FourByteRange) { 225 RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d", 226 base::Bind(ConstructRangedTestString, 227 kFourByteSeqRangeStart, 228 kFourByteSeqRangeEnd), 229 kTestFunctions, 230 2); 231} 232 233} // namespace 234} // namespace base 235