1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/string_util.h"
6 
7 #include <math.h>
8 #include <stdarg.h>
9 #include <stddef.h>
10 #include <stdint.h>
11 
12 #include <algorithm>
13 
14 #include <gmock/gmock.h>
15 #include <gtest/gtest.h>
16 
17 #include "base/macros.h"
18 #include "base/strings/utf_string_conversion_utils.h"
19 
20 using ::testing::ElementsAre;
21 
22 namespace base {
23 
TEST(StringUtilTest,IsStringUTF8)24 TEST(StringUtilTest, IsStringUTF8) {
25   EXPECT_TRUE(IsStringUTF8("abc"));
26   EXPECT_TRUE(IsStringUTF8("\xc2\x81"));
27   EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf"));
28   EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf"));
29   EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"));
30   EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc"));  // UTF-8 BOM
31 
32   // surrogate code points
33   EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf"));
34   EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f"));
35   EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf"));
36 
37   // overlong sequences
38   EXPECT_FALSE(IsStringUTF8("\xc0\x80"));  // U+0000
39   EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81"));  // "AB"
40   EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80"));  // U+0000
41   EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80"));  // U+0080
42   EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf"));  // U+07ff
43   EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D"));  // U+000D
44   EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91"));  // U+0091
45   EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80"));  // U+0800
46   EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf"));  // U+FEFF (BOM)
47   EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf"));  // U+003F
48   EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5"));  // U+00A5
49 
50   // Beyond U+10FFFF (the upper limit of Unicode codespace)
51   EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80"));  // U+110000
52   EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf"));  // 5 bytes
53   EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80"));  // 6 bytes
54 
55   // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE)
56   EXPECT_FALSE(IsStringUTF8("\xfe\xff"));
57   EXPECT_FALSE(IsStringUTF8("\xff\xfe"));
58   EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4)));
59   EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00"));
60 
61   // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF>
62   EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe"));  // U+FFFE)
63   EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe"));  // U+1FFFE
64   EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf"));  // U+10FFFF
65   EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90"));  // U+FDD0
66   EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf"));  // U+FDEF
67   // Strings in legacy encodings. We can certainly make up strings
68   // in a legacy encoding that are valid in UTF-8, but in real data,
69   // most of them are invalid as UTF-8.
70   EXPECT_FALSE(IsStringUTF8("caf\xe9"));  // cafe with U+00E9 in ISO-8859-1
71   EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2"));  // U+AC00, U+AC001 in EUC-KR
72   EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e"));  // U+4F60 U+597D in Big5
73   // "abc" with U+201[CD] in windows-125[0-8]
74   EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94"));
75   // U+0639 U+064E U+0644 U+064E in ISO-8859-6
76   EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee"));
77   // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7
78   EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC"));
79 
80   // Check that we support Embedded Nulls. The first uses the canonical UTF-8
81   // representation, and the second uses a 2-byte sequence. The second version
82   // is invalid UTF-8 since UTF-8 states that the shortest encoding for a
83   // given codepoint must be used.
84   static const char kEmbeddedNull[] = "embedded\0null";
85   EXPECT_TRUE(IsStringUTF8(
86       std::string(kEmbeddedNull, sizeof(kEmbeddedNull))));
87   EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000"));
88 }
89 
// Checks IsStringASCII() on every fragment of an all-ASCII buffer that starts
// at one of the first eight offsets, and checks that setting the high bit of
// any single character inside a fragment makes the fragment non-ASCII.
TEST(StringUtilTest, IsStringASCII) {
  // Mutable on purpose: individual characters are temporarily corrupted below
  // and restored before the next check.
  static char char_ascii[] =
      "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF";

  // Test a variety of the fragment start positions and lengths in order to make
  // sure that bit masking in IsStringASCII works correctly.
  // Also, test that a non-ASCII character will be detected regardless of its
  // position inside the string.
  const size_t string_length = arraysize(char_ascii) - 1;
  for (size_t offset = 0; offset < 8; ++offset) {
    // |len| runs up to and including |max_len| so that the fragment reaching
    // the very end of the buffer is covered as well.
    for (size_t len = 0, max_len = string_length - offset; len <= max_len;
         ++len) {
      EXPECT_TRUE(IsStringASCII(StringPiece(char_ascii + offset, len)));
      // The fragment spans [offset, offset + len), so every position in that
      // range (not just those below |len|) must be corrupted in turn.
      for (size_t char_pos = offset; char_pos < offset + len; ++char_pos) {
        char_ascii[char_pos] |= '\x80';
        EXPECT_FALSE(IsStringASCII(StringPiece(char_ascii + offset, len)));
        char_ascii[char_pos] &= ~'\x80';
      }
    }
  }
}
115 
// Table-driven check of ReplaceChars(). Each row supplies the input string,
// the set of characters to replace, the replacement text, the expected
// resulting string, and the expected boolean return value (true in exactly
// those rows where the expected output differs from the input).
TEST(StringUtilTest, ReplaceChars) {
  struct ReplaceCase {
    const char* input;
    const char* replace_chars;
    const char* replace_with;
    const char* output;
    bool result;
  };

  const ReplaceCase kCases[] = {
      // Nothing to replace: empty inputs or no matching characters.
      {"", "", "", "", false},
      {"test", "", "", "test", false},
      {"test", "", "!", "test", false},
      {"test", "z", "!", "test", false},
      // Single matching character, one- and two-character replacements.
      {"test", "e", "!", "t!st", true},
      {"test", "e", "!?", "t!?st", true},
      {"test", "ez", "!", "t!st", true},
      {"test", "zed", "!?", "t!?st", true},
      // Several occurrences and several distinct matching characters.
      {"test", "t", "!?", "!?es!?", true},
      {"test", "et", "!>", "!>!>s!>", true},
      {"test", "zest", "!", "!!!!", true},
      {"test", "szt", "!", "!e!!", true},
      // The replacement text itself contains the replaced character.
      {"test", "t", "test", "testestest", true},
  };

  for (const ReplaceCase& test_case : kCases) {
    std::string actual;
    const bool replaced =
        ReplaceChars(test_case.input, test_case.replace_chars,
                     test_case.replace_with, &actual);
    EXPECT_EQ(test_case.result, replaced);
    EXPECT_EQ(test_case.output, actual);
  }
}
149 
150 }  // namespace base
151