// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
// All data that is passed through a WebSocket with type "Text" needs to be
// validated as UTF-8. Since this is done on the IO thread, it needs to be
// reasonably fast.
8 
// We are only interested in the performance on valid UTF-8. Invalid UTF-8
// will result in a connection failure, so it is unlikely to become a source
// of performance issues.
12 
#include "base/i18n/streaming_utf8_validator.h"

#include <stddef.h>

#include <algorithm>
#include <string>

#include "base/bind.h"
#include "base/callback.h"
#include "base/macros.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/test/perf_time_logger.h"
#include "testing/gtest/include/gtest/gtest.h"
26 
27 namespace base {
28 namespace {
29 
// Inclusive endpoints of the ranges of valid UTF-8 sequences used to build
// test input, one pair per encoded length. The ranges are intended to be
// large enough that the validator needs to do meaningful work while being in
// some sense "realistic" (e.g. control characters are not included).
const char kOneByteSeqRangeStart[] = " ";  // U+0020
const char kOneByteSeqRangeEnd[] = "~";    // U+007E

const char kTwoByteSeqRangeStart[] = "\xc2\xa0";  // U+00A0 non-breaking space
const char kTwoByteSeqRangeEnd[] = "\xc9\x8f";    // U+024F small y with stroke

const char kThreeByteSeqRangeStart[] = "\xe3\x81\x82";  // U+3042 Hiragana "a"
const char kThreeByteSeqRangeEnd[] = "\xe9\xbf\x83";    // U+9FC3 "to blink"

const char kFourByteSeqRangeStart[] = "\xf0\xa0\x80\x8b";  // U+2000B
const char kFourByteSeqRangeEnd[] = "\xf0\xaa\x9a\xb2";    // U+2A6B2
45 
// The target sizes, in bytes, of the strings to test, from a single byte up
// to 1 MiB.
const size_t kTestLengths[] = {1, 32, 256, 32768, 1 << 20};
48 
// Simplest possible byte-at-a-time validator, to provide a baseline for
// comparison. Returns true iff no byte of |s| has its top bit set (an empty
// string therefore passes). This is only tried on 1-byte UTF-8 sequences, as
// the results will not be meaningful with sequences containing top-bit-set
// bytes.
bool IsString7Bit(const std::string& s) {
  for (const char byte : s) {
    if (byte & 0x80)
      return false;
  }
  return true;
}
60 
61 // Assumes that |previous| is a valid UTF-8 sequence, and attempts to return
62 // the next one. Is just barely smart enough to iterate through the ranges
63 // defined about.
NextUtf8Sequence(const std::string & previous)64 std::string NextUtf8Sequence(const std::string& previous) {
65   DCHECK(StreamingUtf8Validator::Validate(previous));
66   std::string next = previous;
67   for (int i = static_cast<int>(previous.length() - 1); i >= 0; --i) {
68     // All bytes in a UTF-8 sequence except the first one are
69     // constrained to the range 0x80 to 0xbf, inclusive. When we
70     // increment past 0xbf, we carry into the previous byte.
71     if (i > 0 && next[i] == '\xbf') {
72       next[i] = '\x80';
73       continue;  // carry
74     }
75     ++next[i];
76     break;  // no carry
77   }
78   DCHECK(StreamingUtf8Validator::Validate(next))
79       << "Result \"" << next << "\" failed validation";
80   return next;
81 }
82 
// Signature shared by every validator under test: takes the string to check
// and returns whether it was accepted.
using TestTargetType = bool (*)(const std::string&);
84 
85 // Run fuction |target| over |test_string| |times| times, and report the results
86 // using |description|.
RunTest(const std::string & description,TestTargetType target,const std::string & test_string,int times)87 bool RunTest(const std::string& description,
88              TestTargetType target,
89              const std::string& test_string,
90              int times) {
91   base::PerfTimeLogger timer(description.c_str());
92   bool result = true;
93   for (int i = 0; i < times; ++i) {
94     result = target(test_string) && result;
95   }
96   timer.Done();
97   return result;
98 }
99 
// Constructs a string by repeating |input| the smallest whole number of times
// (but at least once) that makes the result equal or exceed |length| bytes.
// An empty |input| can never reach a non-zero |length|, so an empty string is
// returned for it rather than looping forever.
std::string ConstructRepeatedTestString(const std::string& input,
                                        size_t length) {
  const size_t unit = input.length();
  if (unit == 0)
    return std::string();

  // Round |length| up to a whole number of copies of |input|. At least one
  // copy is always produced, even when |length| is 0.
  size_t copies = length / unit;
  if (length % unit != 0)
    ++copies;
  if (copies == 0)
    copies = 1;
  const size_t target_length = copies * unit;

  std::string output;
  // Reserving up front means the self-appends below never reallocate.
  output.reserve(target_length);
  output.append(input);
  while (output.length() < target_length) {
    // |output| always holds whole copies of |input|, and the number of bytes
    // still missing is a multiple of |unit|, so a prefix of |output| of that
    // size is itself made of whole copies. Double the string while that
    // fits, then top it up with a shorter prefix.
    const size_t chunk =
        std::min(output.length(), target_length - output.length());
    output.append(output, 0, chunk);
  }
  return output;
}
113 
114 // Construct a string by expanding the range of UTF-8 sequences
115 // between |input_start| and |input_end|, inclusive, and then
116 // repeating the resulting string until it equals or exceeds |length|
117 // bytes. |input_start| and |input_end| must be valid UTF-8
118 // sequences.
ConstructRangedTestString(const std::string & input_start,const std::string & input_end,size_t length)119 std::string ConstructRangedTestString(const std::string& input_start,
120                                       const std::string& input_end,
121                                       size_t length) {
122   std::string output = input_start;
123   std::string input = input_start;
124   while (output.length() < length && input != input_end) {
125     input = NextUtf8Sequence(input);
126     output += input;
127   }
128   if (output.length() < length) {
129     output = ConstructRepeatedTestString(output, length);
130   }
131   return output;
132 }
133 
134 struct TestFunctionDescription {
135   TestTargetType function;
136   const char* function_name;
137 };
138 
IsStringUTF8(const std::string & str)139 bool IsStringUTF8(const std::string& str) {
140   return base::IsStringUTF8(base::StringPiece(str));
141 }
142 
143 // IsString7Bit is intentionally placed last so it can be excluded easily.
144 const TestFunctionDescription kTestFunctions[] = {
145     {&StreamingUtf8Validator::Validate, "StreamingUtf8Validator"},
146     {&IsStringUTF8, "IsStringUTF8"}, {&IsString7Bit, "IsString7Bit"}};
147 
148 // Construct a test string from |construct_test_string| for each of the lengths
149 // in |kTestLengths| in turn. For each string, run each test in |test_functions|
150 // for a number of iterations such that the total number of bytes validated
151 // is around 16MB.
RunSomeTests(const char format[],base::Callback<std::string (size_t length)> construct_test_string,const TestFunctionDescription * test_functions,size_t test_count)152 void RunSomeTests(
153     const char format[],
154     base::Callback<std::string(size_t length)> construct_test_string,
155     const TestFunctionDescription* test_functions,
156     size_t test_count) {
157   for (size_t i = 0; i < arraysize(kTestLengths); ++i) {
158     const size_t length = kTestLengths[i];
159     const std::string test_string = construct_test_string.Run(length);
160     const int real_length = static_cast<int>(test_string.length());
161     const int times = (1 << 24) / real_length;
162     for (size_t test_index = 0; test_index < test_count; ++test_index) {
163       EXPECT_TRUE(RunTest(StringPrintf(format,
164                                        test_functions[test_index].function_name,
165                                        real_length,
166                                        times),
167                           test_functions[test_index].function,
168                           test_string,
169                           times));
170     }
171   }
172 }
173 
// Times all validators, including the 7-bit baseline, on strings made by
// repeating the single ASCII character at the start of the one-byte range.
TEST(StreamingUtf8ValidatorPerfTest, OneByteRepeated) {
  RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d",
               base::Bind(ConstructRepeatedTestString, kOneByteSeqRangeStart),
               kTestFunctions,
               arraysize(kTestFunctions));
}
180 
// Times all validators, including the 7-bit baseline, on strings that cycle
// through the whole one-byte (ASCII) range.
TEST(StreamingUtf8ValidatorPerfTest, OneByteRange) {
  RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kOneByteSeqRangeStart,
                          kOneByteSeqRangeEnd),
               kTestFunctions,
               arraysize(kTestFunctions));
}
189 
// Times the UTF-8 validators on strings made by repeating a single two-byte
// sequence.
TEST(StreamingUtf8ValidatorPerfTest, TwoByteRepeated) {
  // The trailing IsString7Bit entry is left out: it rejects any string that
  // contains top-bit-set bytes.
  RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d",
               base::Bind(ConstructRepeatedTestString, kTwoByteSeqRangeStart),
               kTestFunctions,
               arraysize(kTestFunctions) - 1);
}
196 
// Times the UTF-8 validators on strings that cycle through the two-byte
// range.
TEST(StreamingUtf8ValidatorPerfTest, TwoByteRange) {
  // The trailing IsString7Bit entry is left out: it rejects any string that
  // contains top-bit-set bytes.
  RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kTwoByteSeqRangeStart,
                          kTwoByteSeqRangeEnd),
               kTestFunctions,
               arraysize(kTestFunctions) - 1);
}
205 
// Times the UTF-8 validators on strings made by repeating a single
// three-byte sequence.
TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRepeated) {
  // The trailing IsString7Bit entry is left out: it rejects any string that
  // contains top-bit-set bytes.
  RunSomeTests(
      "%s: bytes=3 repeated length=%d repeat=%d",
      base::Bind(ConstructRepeatedTestString, kThreeByteSeqRangeStart),
      kTestFunctions,
      arraysize(kTestFunctions) - 1);
}
213 
// Times the UTF-8 validators on strings that cycle through the three-byte
// range.
TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRange) {
  // The trailing IsString7Bit entry is left out: it rejects any string that
  // contains top-bit-set bytes.
  RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kThreeByteSeqRangeStart,
                          kThreeByteSeqRangeEnd),
               kTestFunctions,
               arraysize(kTestFunctions) - 1);
}
222 
// Times the UTF-8 validators on strings made by repeating a single four-byte
// sequence.
TEST(StreamingUtf8ValidatorPerfTest, FourByteRepeated) {
  // The trailing IsString7Bit entry is left out: it rejects any string that
  // contains top-bit-set bytes.
  RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d",
               base::Bind(ConstructRepeatedTestString, kFourByteSeqRangeStart),
               kTestFunctions,
               arraysize(kTestFunctions) - 1);
}
229 
// Times the UTF-8 validators on strings that cycle through the four-byte
// range.
TEST(StreamingUtf8ValidatorPerfTest, FourByteRange) {
  // The trailing IsString7Bit entry is left out: it rejects any string that
  // contains top-bit-set bytes.
  RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kFourByteSeqRangeStart,
                          kFourByteSeqRangeEnd),
               kTestFunctions,
               arraysize(kTestFunctions) - 1);
}
238 
239 }  // namespace
240 }  // namespace base
241