1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9
10 #include "llvm/Support/ConvertUTF.h"
11 #include "llvm/Support/Format.h"
12 #include "gtest/gtest.h"
13 #include <string>
14 #include <utility>
15 #include <vector>
16
17 using namespace llvm;
18
// Checks that little-endian UTF-16 input carrying a byte order mark is
// converted to the expected UTF-8 byte sequence.
TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
  // The look of disapproval: bytes FF FE, then U+0CA0, '_', U+0CA0 stored
  // low byte first.
  static const char UTF16LE[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";

  std::string UTF8;
  // sizeof - 1 keeps the literal's implicit NUL terminator out of the input.
  const bool Converted = convertUTF16ToUTF8String(
      ArrayRef<char>(UTF16LE, sizeof(UTF16LE) - 1), UTF8);
  EXPECT_TRUE(Converted);

  EXPECT_EQ(std::string("\xe0\xb2\xa0_\xe0\xb2\xa0"), UTF8);
}
29
// Checks that big-endian UTF-16 input carrying a byte order mark is converted
// to the expected UTF-8 byte sequence.
TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
  // The look of disapproval: bytes FE FF, then U+0CA0, '_', U+0CA0 stored
  // high byte first.
  static const char UTF16BE[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";

  std::string UTF8;
  // sizeof - 1 keeps the literal's implicit NUL terminator out of the input.
  const bool Converted = convertUTF16ToUTF8String(
      ArrayRef<char>(UTF16BE, sizeof(UTF16BE) - 1), UTF8);
  EXPECT_TRUE(Converted);

  EXPECT_EQ(std::string("\xe0\xb2\xa0_\xe0\xb2\xa0"), UTF8);
}
40
// Checks that a UTF-8 string is converted to the expected three UTF-16 code
// units.
TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
  // The look of disapproval, encoded as UTF-8.
  static const char UTF8[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";

  SmallVector<UTF16, 5> CodeUnits;
  // sizeof - 1 keeps the literal's implicit NUL terminator out of the input.
  const bool Converted =
      convertUTF8ToUTF16String(StringRef(UTF8, sizeof(UTF8) - 1), CodeUnits);
  EXPECT_TRUE(Converted);

  static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
  // Stop here on a size mismatch so the element checks never index past the
  // end of the result.
  ASSERT_EQ(3u, CodeUnits.size());
  // Only the first three entries of Expected are compared.
  for (unsigned Index = 0; Index < 3; ++Index)
    EXPECT_EQ(Expected[Index], CodeUnits[Index]);
}
53
// Checks that a UTF-16 byte buffer with an odd number of bytes is rejected.
TEST(ConvertUTFTest, OddLengthInput) {
  std::string Converted;
  const bool Accepted =
      convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Converted);
  EXPECT_FALSE(Accepted);
}
59
// Checks that converting an empty input succeeds and produces an empty string.
TEST(ConvertUTFTest, Empty) {
  std::string Output;
  const bool Converted = convertUTF16ToUTF8String(None, Output);
  EXPECT_TRUE(Converted);
  EXPECT_TRUE(Output.empty());
}
66
// Exercises hasUTF16ByteOrderMark on inputs that do and do not start with a
// two-byte byte order mark.
TEST(ConvertUTFTest, HasUTF16BOM) {
  // Inputs that begin with FF FE or FE FF are reported as having a BOM.
  bool Detected = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
  EXPECT_TRUE(Detected);

  Detected = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
  EXPECT_TRUE(Detected);

  // Don't care about odd lengths.
  Detected = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
  EXPECT_TRUE(Detected);

  Detected = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
  EXPECT_TRUE(Detected);

  // Inputs shorter than two bytes are reported as having no BOM.
  Detected = hasUTF16ByteOrderMark(None);
  EXPECT_FALSE(Detected);

  Detected = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
  EXPECT_FALSE(Detected);
}
82
83 struct ConvertUTFResultContainer {
84 ConversionResult ErrorCode;
85 std::vector<unsigned> UnicodeScalars;
86
ConvertUTFResultContainerConvertUTFResultContainer87 ConvertUTFResultContainer(ConversionResult ErrorCode)
88 : ErrorCode(ErrorCode) {}
89
90 ConvertUTFResultContainer
withScalarsConvertUTFResultContainer91 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
92 unsigned US2 = 0x110000, unsigned US3 = 0x110000,
93 unsigned US4 = 0x110000, unsigned US5 = 0x110000,
94 unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
95 ConvertUTFResultContainer Result(*this);
96 if (US0 != 0x110000)
97 Result.UnicodeScalars.push_back(US0);
98 if (US1 != 0x110000)
99 Result.UnicodeScalars.push_back(US1);
100 if (US2 != 0x110000)
101 Result.UnicodeScalars.push_back(US2);
102 if (US3 != 0x110000)
103 Result.UnicodeScalars.push_back(US3);
104 if (US4 != 0x110000)
105 Result.UnicodeScalars.push_back(US4);
106 if (US5 != 0x110000)
107 Result.UnicodeScalars.push_back(US5);
108 if (US6 != 0x110000)
109 Result.UnicodeScalars.push_back(US6);
110 if (US7 != 0x110000)
111 Result.UnicodeScalars.push_back(US7);
112 return Result;
113 }
114 };
115
116 std::pair<ConversionResult, std::vector<unsigned>>
ConvertUTF8ToUnicodeScalarsLenient(StringRef S)117 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
118 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
119
120 const UTF8 *SourceNext = SourceStart;
121 std::vector<UTF32> Decoded(S.size(), 0);
122 UTF32 *TargetStart = Decoded.data();
123
124 auto ErrorCode =
125 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
126 Decoded.data() + Decoded.size(), lenientConversion);
127
128 Decoded.resize(TargetStart - Decoded.data());
129
130 return std::make_pair(ErrorCode, Decoded);
131 }
132
133 std::pair<ConversionResult, std::vector<unsigned>>
ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S)134 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
135 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
136
137 const UTF8 *SourceNext = SourceStart;
138 std::vector<UTF32> Decoded(S.size(), 0);
139 UTF32 *TargetStart = Decoded.data();
140
141 auto ErrorCode = ConvertUTF8toUTF32Partial(
142 &SourceNext, SourceStart + S.size(), &TargetStart,
143 Decoded.data() + Decoded.size(), lenientConversion);
144
145 Decoded.resize(TargetStart - Decoded.data());
146
147 return std::make_pair(ErrorCode, Decoded);
148 }
149
150 ::testing::AssertionResult
CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,StringRef S,bool Partial=false)151 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
152 StringRef S, bool Partial = false) {
153 ConversionResult ErrorCode;
154 std::vector<unsigned> Decoded;
155 if (!Partial)
156 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
157 else
158 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
159
160 if (Expected.ErrorCode != ErrorCode)
161 return ::testing::AssertionFailure() << "Expected error code "
162 << Expected.ErrorCode << ", actual "
163 << ErrorCode;
164
165 if (Expected.UnicodeScalars != Decoded)
166 return ::testing::AssertionFailure()
167 << "Expected lenient decoded result:\n"
168 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
169 << "Actual result:\n" << ::testing::PrintToString(Decoded);
170
171 return ::testing::AssertionSuccess();
172 }
173
TEST(ConvertUTFTest,UTF8ToUTF32Lenient)174 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
175
176 //
177 // 1-byte sequences
178 //
179
180 // U+0041 LATIN CAPITAL LETTER A
181 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
182 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
183
184 //
185 // 2-byte sequences
186 //
187
188 // U+0283 LATIN SMALL LETTER ESH
189 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
190 ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
191 "\xca\x83"));
192
193 // U+03BA GREEK SMALL LETTER KAPPA
194 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
195 // U+03C3 GREEK SMALL LETTER SIGMA
196 // U+03BC GREEK SMALL LETTER MU
197 // U+03B5 GREEK SMALL LETTER EPSILON
198 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
199 ConvertUTFResultContainer(conversionOK)
200 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
201 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
202
203 //
204 // 3-byte sequences
205 //
206
207 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
208 // U+6587 CJK UNIFIED IDEOGRAPH-6587
209 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
210 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
211 "\xe4\xbe\x8b\xe6\x96\x87"));
212
213 // U+D55C HANGUL SYLLABLE HAN
214 // U+AE00 HANGUL SYLLABLE GEUL
215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
216 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
217 "\xed\x95\x9c\xea\xb8\x80"));
218
219 // U+1112 HANGUL CHOSEONG HIEUH
220 // U+1161 HANGUL JUNGSEONG A
221 // U+11AB HANGUL JONGSEONG NIEUN
222 // U+1100 HANGUL CHOSEONG KIYEOK
223 // U+1173 HANGUL JUNGSEONG EU
224 // U+11AF HANGUL JONGSEONG RIEUL
225 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
226 ConvertUTFResultContainer(conversionOK)
227 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
228 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
229 "\xe1\x86\xaf"));
230
231 //
232 // 4-byte sequences
233 //
234
235 // U+E0100 VARIATION SELECTOR-17
236 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
237 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
238 "\xf3\xa0\x84\x80"));
239
240 //
241 // First possible sequence of a certain length
242 //
243
244 // U+0000 NULL
245 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
246 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
247 StringRef("\x00", 1)));
248
249 // U+0080 PADDING CHARACTER
250 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
251 ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
252 "\xc2\x80"));
253
254 // U+0800 SAMARITAN LETTER ALAF
255 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
256 ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
257 "\xe0\xa0\x80"));
258
259 // U+10000 LINEAR B SYLLABLE B008 A
260 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
261 ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
262 "\xf0\x90\x80\x80"));
263
264 // U+200000 (invalid)
265 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
266 ConvertUTFResultContainer(sourceIllegal)
267 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
268 "\xf8\x88\x80\x80\x80"));
269
270 // U+4000000 (invalid)
271 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
272 ConvertUTFResultContainer(sourceIllegal)
273 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
274 "\xfc\x84\x80\x80\x80\x80"));
275
276 //
277 // Last possible sequence of a certain length
278 //
279
280 // U+007F DELETE
281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
282 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
283
284 // U+07FF (unassigned)
285 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
286 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
287 "\xdf\xbf"));
288
289 // U+FFFF (noncharacter)
290 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
291 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
292 "\xef\xbf\xbf"));
293
294 // U+1FFFFF (invalid)
295 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
296 ConvertUTFResultContainer(sourceIllegal)
297 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
298 "\xf7\xbf\xbf\xbf"));
299
300 // U+3FFFFFF (invalid)
301 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
302 ConvertUTFResultContainer(sourceIllegal)
303 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
304 "\xfb\xbf\xbf\xbf\xbf"));
305
306 // U+7FFFFFFF (invalid)
307 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
308 ConvertUTFResultContainer(sourceIllegal)
309 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
310 "\xfd\xbf\xbf\xbf\xbf\xbf"));
311
312 //
313 // Other boundary conditions
314 //
315
316 // U+D7FF (unassigned)
317 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
318 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
319 "\xed\x9f\xbf"));
320
321 // U+E000 (private use)
322 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
323 ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
324 "\xee\x80\x80"));
325
326 // U+FFFD REPLACEMENT CHARACTER
327 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
328 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
329 "\xef\xbf\xbd"));
330
331 // U+10FFFF (noncharacter)
332 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
333 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
334 "\xf4\x8f\xbf\xbf"));
335
336 // U+110000 (invalid)
337 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
338 ConvertUTFResultContainer(sourceIllegal)
339 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
340 "\xf4\x90\x80\x80"));
341
342 //
343 // Unexpected continuation bytes
344 //
345
346 // A sequence of unexpected continuation bytes that don't follow a first
347 // byte, every byte is a maximal subpart.
348
349 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
350 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
351 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
352 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
354 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
355 "\x80\x80"));
356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
358 "\x80\xbf"));
359 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
360 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
361 "\xbf\x80"));
362 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
363 ConvertUTFResultContainer(sourceIllegal)
364 .withScalars(0xfffd, 0xfffd, 0xfffd),
365 "\x80\xbf\x80"));
366 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
367 ConvertUTFResultContainer(sourceIllegal)
368 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
369 "\x80\xbf\x80\xbf"));
370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371 ConvertUTFResultContainer(sourceIllegal)
372 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
373 "\x80\xbf\x82\xbf\xaa"));
374 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
375 ConvertUTFResultContainer(sourceIllegal)
376 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
377 "\xaa\xb0\xbb\xbf\xaa\xa0"));
378 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
379 ConvertUTFResultContainer(sourceIllegal)
380 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
381 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
382
383 // All continuation bytes (0x80--0xbf).
384 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
385 ConvertUTFResultContainer(sourceIllegal)
386 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
387 0xfffd, 0xfffd, 0xfffd, 0xfffd)
388 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
389 0xfffd, 0xfffd, 0xfffd, 0xfffd)
390 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
391 0xfffd, 0xfffd, 0xfffd, 0xfffd)
392 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
393 0xfffd, 0xfffd, 0xfffd, 0xfffd)
394 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
395 0xfffd, 0xfffd, 0xfffd, 0xfffd)
396 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
397 0xfffd, 0xfffd, 0xfffd, 0xfffd)
398 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
399 0xfffd, 0xfffd, 0xfffd, 0xfffd)
400 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
401 0xfffd, 0xfffd, 0xfffd, 0xfffd),
402 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
403 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
404 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
405 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
406
407 //
408 // Lonely start bytes
409 //
410
411 // Start bytes of 2-byte sequences (0xc0--0xdf).
412 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
413 ConvertUTFResultContainer(sourceIllegal)
414 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
415 0xfffd, 0xfffd, 0xfffd, 0xfffd)
416 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
417 0xfffd, 0xfffd, 0xfffd, 0xfffd)
418 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
419 0xfffd, 0xfffd, 0xfffd, 0xfffd)
420 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
421 0xfffd, 0xfffd, 0xfffd, 0xfffd),
422 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
423 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
424
425 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
426 ConvertUTFResultContainer(sourceIllegal)
427 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
428 0xfffd, 0x0020, 0xfffd, 0x0020)
429 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
430 0xfffd, 0x0020, 0xfffd, 0x0020)
431 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
432 0xfffd, 0x0020, 0xfffd, 0x0020)
433 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
434 0xfffd, 0x0020, 0xfffd, 0x0020)
435 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
436 0xfffd, 0x0020, 0xfffd, 0x0020)
437 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
438 0xfffd, 0x0020, 0xfffd, 0x0020)
439 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
440 0xfffd, 0x0020, 0xfffd, 0x0020)
441 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
442 0xfffd, 0x0020, 0xfffd, 0x0020),
443 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
444 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
445 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
446 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
447
448 // Start bytes of 3-byte sequences (0xe0--0xef).
449 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
450 ConvertUTFResultContainer(sourceIllegal)
451 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
452 0xfffd, 0xfffd, 0xfffd, 0xfffd)
453 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
454 0xfffd, 0xfffd, 0xfffd, 0xfffd),
455 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
456
457 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
458 ConvertUTFResultContainer(sourceIllegal)
459 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
460 0xfffd, 0x0020, 0xfffd, 0x0020)
461 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
462 0xfffd, 0x0020, 0xfffd, 0x0020)
463 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
464 0xfffd, 0x0020, 0xfffd, 0x0020)
465 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466 0xfffd, 0x0020, 0xfffd, 0x0020),
467 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
468 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
469
470 // Start bytes of 4-byte sequences (0xf0--0xf7).
471 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
472 ConvertUTFResultContainer(sourceIllegal)
473 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
474 0xfffd, 0xfffd, 0xfffd, 0xfffd),
475 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
476
477 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
478 ConvertUTFResultContainer(sourceIllegal)
479 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
480 0xfffd, 0x0020, 0xfffd, 0x0020)
481 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
482 0xfffd, 0x0020, 0xfffd, 0x0020),
483 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
484
485 // Start bytes of 5-byte sequences (0xf8--0xfb).
486 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
487 ConvertUTFResultContainer(sourceIllegal)
488 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
489 "\xf8\xf9\xfa\xfb"));
490
491 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
492 ConvertUTFResultContainer(sourceIllegal)
493 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
494 0xfffd, 0x0020, 0xfffd, 0x0020),
495 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
496
497 // Start bytes of 6-byte sequences (0xfc--0xfd).
498 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
499 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
500 "\xfc\xfd"));
501
502 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
503 ConvertUTFResultContainer(sourceIllegal)
504 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
505 "\xfc\x20\xfd\x20"));
506
507 //
508 // Other bytes (0xc0--0xc1, 0xfe--0xff).
509 //
510
511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
513 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
514 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
515 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
516 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
517 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
518 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
519
520 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
521 ConvertUTFResultContainer(sourceIllegal)
522 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
523 "\xc0\xc1\xfe\xff"));
524
525 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
526 ConvertUTFResultContainer(sourceIllegal)
527 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
528 "\xfe\xfe\xff\xff"));
529
530 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
531 ConvertUTFResultContainer(sourceIllegal)
532 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
533 "\xfe\x80\x80\x80\x80\x80"));
534
535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
536 ConvertUTFResultContainer(sourceIllegal)
537 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
538 "\xff\x80\x80\x80\x80\x80"));
539
540 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
541 ConvertUTFResultContainer(sourceIllegal)
542 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
543 0xfffd, 0x0020, 0xfffd, 0x0020),
544 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
545
546 //
547 // Sequences with one continuation byte missing
548 //
549
550 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
551 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
552 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
553 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
554 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
555 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
556 "\xe0\xa0"));
557 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
558 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
559 "\xe0\xbf"));
560 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
561 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
562 "\xe1\x80"));
563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
564 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
565 "\xec\xbf"));
566 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
567 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
568 "\xed\x80"));
569 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
570 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
571 "\xed\x9f"));
572 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
573 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
574 "\xee\x80"));
575 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
576 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
577 "\xef\xbf"));
578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
579 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
580 "\xf0\x90\x80"));
581 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
582 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
583 "\xf0\xbf\xbf"));
584 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
586 "\xf1\x80\x80"));
587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
589 "\xf3\xbf\xbf"));
590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
592 "\xf4\x80\x80"));
593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
595 "\xf4\x8f\xbf"));
596
597 // Overlong sequences with one trailing byte missing.
598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
599 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
600 "\xc0"));
601 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
602 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
603 "\xc1"));
604 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
605 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
606 "\xe0\x80"));
607 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
608 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
609 "\xe0\x9f"));
610 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
611 ConvertUTFResultContainer(sourceIllegal)
612 .withScalars(0xfffd, 0xfffd, 0xfffd),
613 "\xf0\x80\x80"));
614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
615 ConvertUTFResultContainer(sourceIllegal)
616 .withScalars(0xfffd, 0xfffd, 0xfffd),
617 "\xf0\x8f\x80"));
618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619 ConvertUTFResultContainer(sourceIllegal)
620 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
621 "\xf8\x80\x80\x80"));
622 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
623 ConvertUTFResultContainer(sourceIllegal)
624 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
625 "\xfc\x80\x80\x80\x80"));
626
627 // Sequences that represent surrogates with one trailing byte missing.
628 // High surrogates
629 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
630 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
631 "\xed\xa0"));
632 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
633 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
634 "\xed\xac"));
635 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
636 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
637 "\xed\xaf"));
638 // Low surrogates
639 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
640 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
641 "\xed\xb0"));
642 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
643 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
644 "\xed\xb4"));
645 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
646 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
647 "\xed\xbf"));
648
649 // Ill-formed 4-byte sequences.
650 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
651 // U+1100xx (invalid)
652 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
653 ConvertUTFResultContainer(sourceIllegal)
654 .withScalars(0xfffd, 0xfffd, 0xfffd),
655 "\xf4\x90\x80"));
656 // U+13FBxx (invalid)
657 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
658 ConvertUTFResultContainer(sourceIllegal)
659 .withScalars(0xfffd, 0xfffd, 0xfffd),
660 "\xf4\xbf\xbf"));
661 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
662 ConvertUTFResultContainer(sourceIllegal)
663 .withScalars(0xfffd, 0xfffd, 0xfffd),
664 "\xf5\x80\x80"));
665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
666 ConvertUTFResultContainer(sourceIllegal)
667 .withScalars(0xfffd, 0xfffd, 0xfffd),
668 "\xf6\x80\x80"));
669 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
670 ConvertUTFResultContainer(sourceIllegal)
671 .withScalars(0xfffd, 0xfffd, 0xfffd),
672 "\xf7\x80\x80"));
673 // U+1FFBxx (invalid)
674 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
675 ConvertUTFResultContainer(sourceIllegal)
676 .withScalars(0xfffd, 0xfffd, 0xfffd),
677 "\xf7\xbf\xbf"));
678
679 // Ill-formed 5-byte sequences.
680 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
681 // U+2000xx (invalid)
682 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
683 ConvertUTFResultContainer(sourceIllegal)
684 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
685 "\xf8\x88\x80\x80"));
686 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
687 ConvertUTFResultContainer(sourceIllegal)
688 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
689 "\xf8\xbf\xbf\xbf"));
690 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
691 ConvertUTFResultContainer(sourceIllegal)
692 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
693 "\xf9\x80\x80\x80"));
694 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
695 ConvertUTFResultContainer(sourceIllegal)
696 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
697 "\xfa\x80\x80\x80"));
698 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
699 ConvertUTFResultContainer(sourceIllegal)
700 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
701 "\xfb\x80\x80\x80"));
702 // U+3FFFFxx (invalid)
703 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
704 ConvertUTFResultContainer(sourceIllegal)
705 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
706 "\xfb\xbf\xbf\xbf"));
707
708 // Ill-formed 6-byte sequences.
709 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx
710 // U+40000xx (invalid)
711 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
712 ConvertUTFResultContainer(sourceIllegal)
713 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
714 "\xfc\x84\x80\x80\x80"));
715 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
716 ConvertUTFResultContainer(sourceIllegal)
717 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
718 "\xfc\xbf\xbf\xbf\xbf"));
719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720 ConvertUTFResultContainer(sourceIllegal)
721 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
722 "\xfd\x80\x80\x80\x80"));
723 // U+7FFFFFxx (invalid)
724 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
725 ConvertUTFResultContainer(sourceIllegal)
726 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
727 "\xfd\xbf\xbf\xbf\xbf"));
728
729 //
730 // Sequences with two continuation bytes missing
731 //
732
733 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
734 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
735 "\xf0\x90"));
736 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
737 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
738 "\xf0\xbf"));
739 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
740 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
741 "\xf1\x80"));
742 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
743 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
744 "\xf3\xbf"));
745 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
746 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
747 "\xf4\x80"));
748 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
749 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
750 "\xf4\x8f"));
751
752 // Overlong sequences with two trailing byte missing.
753 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
754 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
755 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
756 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
757 "\xf0\x80"));
758 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
759 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
760 "\xf0\x8f"));
761 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
762 ConvertUTFResultContainer(sourceIllegal)
763 .withScalars(0xfffd, 0xfffd, 0xfffd),
764 "\xf8\x80\x80"));
765 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
766 ConvertUTFResultContainer(sourceIllegal)
767 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
768 "\xfc\x80\x80\x80"));
769
770 // Sequences that represent surrogates with two trailing bytes missing.
771 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
772 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
773
774 // Ill-formed 4-byte sequences.
775 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
776 // U+110yxx (invalid)
777 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
778 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
779 "\xf4\x90"));
780 // U+13Fyxx (invalid)
781 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
782 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
783 "\xf4\xbf"));
784 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
785 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
786 "\xf5\x80"));
787 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
789 "\xf6\x80"));
790 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
791 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
792 "\xf7\x80"));
793 // U+1FFyxx (invalid)
794 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
795 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
796 "\xf7\xbf"));
797
798 // Ill-formed 5-byte sequences.
799 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
800 // U+200yxx (invalid)
801 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
802 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
803 "\xf8\x88\x80"));
804 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
805 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
806 "\xf8\xbf\xbf"));
807 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
808 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
809 "\xf9\x80\x80"));
810 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
812 "\xfa\x80\x80"));
813 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
815 "\xfb\x80\x80"));
816 // U+3FFFyxx (invalid)
817 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
818 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
819 "\xfb\xbf\xbf"));
820
821 // Ill-formed 6-byte sequences.
822 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
823 // U+4000yxx (invalid)
824 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
825 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
826 "\xfc\x84\x80\x80"));
827 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
828 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
829 "\xfc\xbf\xbf\xbf"));
830 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
831 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
832 "\xfd\x80\x80\x80"));
833 // U+7FFFFyxx (invalid)
834 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
836 "\xfd\xbf\xbf\xbf"));
837
838 //
839 // Sequences with three continuation bytes missing
840 //
841
842 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
844 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
845 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
846 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
847 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
848 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
849 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
850 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
851 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
852
853 // Broken overlong sequences.
854 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
855 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
856 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
857 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
858 "\xf8\x80"));
859 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
860 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
861 "\xfc\x80\x80"));
862
863 // Ill-formed 4-byte sequences.
864 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
865 // U+14yyxx (invalid)
866 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
867 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
868 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
869 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
870 // U+1Cyyxx (invalid)
871 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
872 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
873
874 // Ill-formed 5-byte sequences.
875 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
876 // U+20yyxx (invalid)
877 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
878 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
879 "\xf8\x88"));
880 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
881 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
882 "\xf8\xbf"));
883 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
884 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
885 "\xf9\x80"));
886 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
887 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
888 "\xfa\x80"));
889 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
891 "\xfb\x80"));
892 // U+3FCyyxx (invalid)
893 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
894 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
895 "\xfb\xbf"));
896
897 // Ill-formed 6-byte sequences.
898 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
899 // U+400yyxx (invalid)
900 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
901 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
902 "\xfc\x84\x80"));
903 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
904 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
905 "\xfc\xbf\xbf"));
906 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
907 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
908 "\xfd\x80\x80"));
909 // U+7FFCyyxx (invalid)
910 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
911 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
912 "\xfd\xbf\xbf"));
913
914 //
915 // Sequences with four continuation bytes missing
916 //
917
918 // Ill-formed 5-byte sequences.
919 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
920 // U+uzyyxx (invalid)
921 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
922 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
923 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
924 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
925 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
926 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
927 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
928 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
929 // U+3zyyxx (invalid)
930 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
931 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
932
933 // Broken overlong sequences.
934 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
935 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
936 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
937 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
938 "\xfc\x80"));
939
940 // Ill-formed 6-byte sequences.
941 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
942 // U+uzzyyxx (invalid)
943 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
944 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
945 "\xfc\x84"));
946 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
947 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
948 "\xfc\xbf"));
949 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
950 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
951 "\xfd\x80"));
952 // U+7Fzzyyxx (invalid)
953 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
954 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
955 "\xfd\xbf"));
956
957 //
958 // Sequences with five continuation bytes missing
959 //
960
961 // Ill-formed 6-byte sequences.
962 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
963 // U+uzzyyxx (invalid)
964 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
965 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
966 // U+uuzzyyxx (invalid)
967 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
968 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
969
970 //
971 // Consecutive sequences with trailing bytes missing
972 //
973
974 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
975 ConvertUTFResultContainer(sourceIllegal)
976 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
977 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
978 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
979 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
980 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
981 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
982 "\xc0" "\xe0\x80" "\xf0\x80\x80"
983 "\xf8\x80\x80\x80"
984 "\xfc\x80\x80\x80\x80"
985 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
986 "\xfb\xbf\xbf\xbf"
987 "\xfd\xbf\xbf\xbf\xbf"));
988
989 //
990 // Overlong UTF-8 sequences
991 //
992
993 // U+002F SOLIDUS
994 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
995 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
996
997 // Overlong sequences of the above.
998 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
999 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1000 "\xc0\xaf"));
1001 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1002 ConvertUTFResultContainer(sourceIllegal)
1003 .withScalars(0xfffd, 0xfffd, 0xfffd),
1004 "\xe0\x80\xaf"));
1005 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1006 ConvertUTFResultContainer(sourceIllegal)
1007 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1008 "\xf0\x80\x80\xaf"));
1009 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1010 ConvertUTFResultContainer(sourceIllegal)
1011 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1012 "\xf8\x80\x80\x80\xaf"));
1013 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1014 ConvertUTFResultContainer(sourceIllegal)
1015 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1016 "\xfc\x80\x80\x80\x80\xaf"));
1017
1018 // U+0000 NULL
1019 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1020 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1021 StringRef("\x00", 1)));
1022
1023 // Overlong sequences of the above.
1024 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1025 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1026 "\xc0\x80"));
1027 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1028 ConvertUTFResultContainer(sourceIllegal)
1029 .withScalars(0xfffd, 0xfffd, 0xfffd),
1030 "\xe0\x80\x80"));
1031 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032 ConvertUTFResultContainer(sourceIllegal)
1033 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1034 "\xf0\x80\x80\x80"));
1035 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1036 ConvertUTFResultContainer(sourceIllegal)
1037 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1038 "\xf8\x80\x80\x80\x80"));
1039 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1040 ConvertUTFResultContainer(sourceIllegal)
1041 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1042 "\xfc\x80\x80\x80\x80\x80"));
1043
1044 // Other overlong sequences.
1045 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1046 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1047 "\xc0\xbf"));
1048 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1049 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1050 "\xc1\x80"));
1051 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1052 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1053 "\xc1\xbf"));
1054 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1055 ConvertUTFResultContainer(sourceIllegal)
1056 .withScalars(0xfffd, 0xfffd, 0xfffd),
1057 "\xe0\x9f\xbf"));
1058 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1059 ConvertUTFResultContainer(sourceIllegal)
1060 .withScalars(0xfffd, 0xfffd, 0xfffd),
1061 "\xed\xa0\x80"));
1062 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1063 ConvertUTFResultContainer(sourceIllegal)
1064 .withScalars(0xfffd, 0xfffd, 0xfffd),
1065 "\xed\xbf\xbf"));
1066 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1067 ConvertUTFResultContainer(sourceIllegal)
1068 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1069 "\xf0\x8f\x80\x80"));
1070 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1071 ConvertUTFResultContainer(sourceIllegal)
1072 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1073 "\xf0\x8f\xbf\xbf"));
1074 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1075 ConvertUTFResultContainer(sourceIllegal)
1076 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1077 "\xf8\x87\xbf\xbf\xbf"));
1078 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1079 ConvertUTFResultContainer(sourceIllegal)
1080 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1081 "\xfc\x83\xbf\xbf\xbf\xbf"));
1082
1083 //
1084 // Isolated surrogates
1085 //
1086
1087 // Unicode 6.3.0:
1088 //
1089 // D71. High-surrogate code point: A Unicode code point in the range
1090 // U+D800 to U+DBFF.
1091 //
1092 // D73. Low-surrogate code point: A Unicode code point in the range
1093 // U+DC00 to U+DFFF.
1094
1095 // Note: U+E0100 is <DB40 DD00> in UTF16.
1096
1097 // High surrogates
1098
1099 // U+D800
1100 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1101 ConvertUTFResultContainer(sourceIllegal)
1102 .withScalars(0xfffd, 0xfffd, 0xfffd),
1103 "\xed\xa0\x80"));
1104
1105 // U+DB40
1106 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107 ConvertUTFResultContainer(sourceIllegal)
1108 .withScalars(0xfffd, 0xfffd, 0xfffd),
1109 "\xed\xac\xa0"));
1110
1111 // U+DBFF
1112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113 ConvertUTFResultContainer(sourceIllegal)
1114 .withScalars(0xfffd, 0xfffd, 0xfffd),
1115 "\xed\xaf\xbf"));
1116
1117 // Low surrogates
1118
1119 // U+DC00
1120 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1121 ConvertUTFResultContainer(sourceIllegal)
1122 .withScalars(0xfffd, 0xfffd, 0xfffd),
1123 "\xed\xb0\x80"));
1124
1125 // U+DD00
1126 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1127 ConvertUTFResultContainer(sourceIllegal)
1128 .withScalars(0xfffd, 0xfffd, 0xfffd),
1129 "\xed\xb4\x80"));
1130
1131 // U+DFFF
1132 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133 ConvertUTFResultContainer(sourceIllegal)
1134 .withScalars(0xfffd, 0xfffd, 0xfffd),
1135 "\xed\xbf\xbf"));
1136
1137 // Surrogate pairs
1138
1139 // U+D800 U+DC00
1140 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1141 ConvertUTFResultContainer(sourceIllegal)
1142 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1143 "\xed\xa0\x80\xed\xb0\x80"));
1144
1145 // U+D800 U+DD00
1146 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1147 ConvertUTFResultContainer(sourceIllegal)
1148 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1149 "\xed\xa0\x80\xed\xb4\x80"));
1150
1151 // U+D800 U+DFFF
1152 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1153 ConvertUTFResultContainer(sourceIllegal)
1154 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1155 "\xed\xa0\x80\xed\xbf\xbf"));
1156
1157 // U+DB40 U+DC00
1158 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1159 ConvertUTFResultContainer(sourceIllegal)
1160 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1161 "\xed\xac\xa0\xed\xb0\x80"));
1162
1163 // U+DB40 U+DD00
1164 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1165 ConvertUTFResultContainer(sourceIllegal)
1166 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1167 "\xed\xac\xa0\xed\xb4\x80"));
1168
1169 // U+DB40 U+DFFF
1170 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1171 ConvertUTFResultContainer(sourceIllegal)
1172 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1173 "\xed\xac\xa0\xed\xbf\xbf"));
1174
1175 // U+DBFF U+DC00
1176 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1177 ConvertUTFResultContainer(sourceIllegal)
1178 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1179 "\xed\xaf\xbf\xed\xb0\x80"));
1180
1181 // U+DBFF U+DD00
1182 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1183 ConvertUTFResultContainer(sourceIllegal)
1184 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1185 "\xed\xaf\xbf\xed\xb4\x80"));
1186
1187 // U+DBFF U+DFFF
1188 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1189 ConvertUTFResultContainer(sourceIllegal)
1190 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1191 "\xed\xaf\xbf\xed\xbf\xbf"));
1192
1193 //
1194 // Noncharacters
1195 //
1196
1197 // Unicode 6.3.0:
1198 //
1199 // D14. Noncharacter: A code point that is permanently reserved for
1200 // internal use and that should never be interchanged. Noncharacters
  // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1202 // and the values U+FDD0..U+FDEF.
1203
1204 // U+FFFE
1205 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1206 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1207 "\xef\xbf\xbe"));
1208
1209 // U+FFFF
1210 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1211 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1212 "\xef\xbf\xbf"));
1213
1214 // U+1FFFE
1215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1216 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1217 "\xf0\x9f\xbf\xbe"));
1218
1219 // U+1FFFF
1220 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1221 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1222 "\xf0\x9f\xbf\xbf"));
1223
1224 // U+2FFFE
1225 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1226 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1227 "\xf0\xaf\xbf\xbe"));
1228
1229 // U+2FFFF
1230 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1231 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1232 "\xf0\xaf\xbf\xbf"));
1233
1234 // U+3FFFE
1235 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1236 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1237 "\xf0\xbf\xbf\xbe"));
1238
1239 // U+3FFFF
1240 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1241 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1242 "\xf0\xbf\xbf\xbf"));
1243
1244 // U+4FFFE
1245 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1246 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1247 "\xf1\x8f\xbf\xbe"));
1248
1249 // U+4FFFF
1250 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1251 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1252 "\xf1\x8f\xbf\xbf"));
1253
1254 // U+5FFFE
1255 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1256 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1257 "\xf1\x9f\xbf\xbe"));
1258
1259 // U+5FFFF
1260 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1261 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1262 "\xf1\x9f\xbf\xbf"));
1263
1264 // U+6FFFE
1265 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1266 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1267 "\xf1\xaf\xbf\xbe"));
1268
1269 // U+6FFFF
1270 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1271 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1272 "\xf1\xaf\xbf\xbf"));
1273
1274 // U+7FFFE
1275 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1276 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1277 "\xf1\xbf\xbf\xbe"));
1278
1279 // U+7FFFF
1280 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1281 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1282 "\xf1\xbf\xbf\xbf"));
1283
1284 // U+8FFFE
1285 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1286 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1287 "\xf2\x8f\xbf\xbe"));
1288
1289 // U+8FFFF
1290 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1291 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1292 "\xf2\x8f\xbf\xbf"));
1293
1294 // U+9FFFE
1295 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1296 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1297 "\xf2\x9f\xbf\xbe"));
1298
1299 // U+9FFFF
1300 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1301 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1302 "\xf2\x9f\xbf\xbf"));
1303
1304 // U+AFFFE
1305 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1306 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1307 "\xf2\xaf\xbf\xbe"));
1308
1309 // U+AFFFF
1310 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1311 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1312 "\xf2\xaf\xbf\xbf"));
1313
1314 // U+BFFFE
1315 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1316 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1317 "\xf2\xbf\xbf\xbe"));
1318
1319 // U+BFFFF
1320 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1321 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1322 "\xf2\xbf\xbf\xbf"));
1323
1324 // U+CFFFE
1325 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1326 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1327 "\xf3\x8f\xbf\xbe"));
1328
1329 // U+CFFFF
1330 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1331 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1332 "\xf3\x8f\xbf\xbf"));
1333
1334 // U+DFFFE
1335 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1336 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1337 "\xf3\x9f\xbf\xbe"));
1338
1339 // U+DFFFF
1340 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1341 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1342 "\xf3\x9f\xbf\xbf"));
1343
1344 // U+EFFFE
1345 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1346 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1347 "\xf3\xaf\xbf\xbe"));
1348
1349 // U+EFFFF
1350 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1351 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1352 "\xf3\xaf\xbf\xbf"));
1353
1354 // U+FFFFE
1355 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1356 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1357 "\xf3\xbf\xbf\xbe"));
1358
1359 // U+FFFFF
1360 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1361 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1362 "\xf3\xbf\xbf\xbf"));
1363
1364 // U+10FFFE
1365 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1366 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1367 "\xf4\x8f\xbf\xbe"));
1368
1369 // U+10FFFF
1370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1371 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1372 "\xf4\x8f\xbf\xbf"));
1373
1374 // U+FDD0
1375 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1376 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1377 "\xef\xb7\x90"));
1378
1379 // U+FDD1
1380 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1381 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1382 "\xef\xb7\x91"));
1383
1384 // U+FDD2
1385 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1386 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1387 "\xef\xb7\x92"));
1388
1389 // U+FDD3
1390 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1391 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1392 "\xef\xb7\x93"));
1393
1394 // U+FDD4
1395 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1396 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1397 "\xef\xb7\x94"));
1398
1399 // U+FDD5
1400 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1401 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1402 "\xef\xb7\x95"));
1403
1404 // U+FDD6
1405 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1406 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1407 "\xef\xb7\x96"));
1408
1409 // U+FDD7
1410 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1411 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1412 "\xef\xb7\x97"));
1413
1414 // U+FDD8
1415 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1416 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1417 "\xef\xb7\x98"));
1418
1419 // U+FDD9
1420 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1421 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1422 "\xef\xb7\x99"));
1423
1424 // U+FDDA
1425 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1426 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1427 "\xef\xb7\x9a"));
1428
1429 // U+FDDB
1430 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1431 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1432 "\xef\xb7\x9b"));
1433
1434 // U+FDDC
1435 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1436 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1437 "\xef\xb7\x9c"));
1438
1439 // U+FDDD
1440 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1441 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1442 "\xef\xb7\x9d"));
1443
1444 // U+FDDE
1445 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1446 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1447 "\xef\xb7\x9e"));
1448
1449 // U+FDDF
1450 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1451 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1452 "\xef\xb7\x9f"));
1453
1454 // U+FDE0
1455 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1456 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1457 "\xef\xb7\xa0"));
1458
1459 // U+FDE1
1460 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1461 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1462 "\xef\xb7\xa1"));
1463
1464 // U+FDE2
1465 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1466 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1467 "\xef\xb7\xa2"));
1468
1469 // U+FDE3
1470 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1471 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1472 "\xef\xb7\xa3"));
1473
1474 // U+FDE4
1475 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1476 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1477 "\xef\xb7\xa4"));
1478
1479 // U+FDE5
1480 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1481 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1482 "\xef\xb7\xa5"));
1483
1484 // U+FDE6
1485 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1486 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1487 "\xef\xb7\xa6"));
1488
1489 // U+FDE7
1490 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1491 ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1492 "\xef\xb7\xa7"));
1493
1494 // U+FDE8
1495 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1496 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1497 "\xef\xb7\xa8"));
1498
1499 // U+FDE9
1500 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1501 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1502 "\xef\xb7\xa9"));
1503
1504 // U+FDEA
1505 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1506 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1507 "\xef\xb7\xaa"));
1508
1509 // U+FDEB
1510 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1511 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1512 "\xef\xb7\xab"));
1513
1514 // U+FDEC
1515 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1516 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1517 "\xef\xb7\xac"));
1518
1519 // U+FDED
1520 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1521 ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1522 "\xef\xb7\xad"));
1523
1524 // U+FDEE
1525 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1526 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1527 "\xef\xb7\xae"));
1528
1529 // U+FDEF
1530 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1531 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1532 "\xef\xb7\xaf"));
1533
1534 // U+FDF0
1535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1536 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1537 "\xef\xb7\xb0"));
1538
1539 // U+FDF1
1540 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1541 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1542 "\xef\xb7\xb1"));
1543
1544 // U+FDF2
1545 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1546 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1547 "\xef\xb7\xb2"));
1548
1549 // U+FDF3
1550 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1551 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1552 "\xef\xb7\xb3"));
1553
1554 // U+FDF4
1555 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1556 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1557 "\xef\xb7\xb4"));
1558
1559 // U+FDF5
1560 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1561 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1562 "\xef\xb7\xb5"));
1563
1564 // U+FDF6
1565 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1566 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1567 "\xef\xb7\xb6"));
1568
1569 // U+FDF7
1570 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1571 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1572 "\xef\xb7\xb7"));
1573
1574 // U+FDF8
1575 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1576 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1577 "\xef\xb7\xb8"));
1578
1579 // U+FDF9
1580 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1581 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1582 "\xef\xb7\xb9"));
1583
1584 // U+FDFA
1585 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1586 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1587 "\xef\xb7\xba"));
1588
1589 // U+FDFB
1590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1591 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1592 "\xef\xb7\xbb"));
1593
1594 // U+FDFC
1595 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1596 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1597 "\xef\xb7\xbc"));
1598
1599 // U+FDFD
1600 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1601 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1602 "\xef\xb7\xbd"));
1603
1604 // U+FDFE
1605 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1606 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1607 "\xef\xb7\xbe"));
1608
1609 // U+FDFF
1610 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1611 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1612 "\xef\xb7\xbf"));
1613 }
1614
// Checks CheckConvertUTF8ToUnicodeScalars when `true` is passed as its third
// argument: a complete sequence must still decode, while input that stops one
// byte short of finishing a multi-byte sequence must report sourceExhausted
// and produce no scalar for the unfinished sequence.
// NOTE(review): the third argument presumably selects the "partial input"
// conversion mode; confirm against the helper's definition.
TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
  // Baseline: a complete one-byte sequence, U+0041 LATIN CAPITAL LETTER A.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41",
      true));

  //
  // Sequences with one continuation byte missing
  //

  // Listed in the order they are checked; each one is expected to yield
  // sourceExhausted and no scalars.
  static const char *const MissingFinalByte[] = {
      // Two-byte forms: only the lead byte is present.
      "\xc2",
      "\xdf",
      // Three-byte forms: lead byte plus one continuation byte.
      "\xe0\xa0",
      "\xe0\xbf",
      "\xe1\x80",
      "\xec\xbf",
      "\xed\x80",
      "\xed\x9f",
      "\xee\x80",
      "\xef\xbf",
      // Four-byte forms: lead byte plus two continuation bytes.
      "\xf0\x90\x80",
      "\xf0\xbf\xbf",
      "\xf1\x80\x80",
      "\xf3\xbf\xbf",
      "\xf4\x80\x80",
      "\xf4\x8f\xbf",
  };
  const unsigned NumCases =
      sizeof(MissingFinalByte) / sizeof(MissingFinalByte[0]);
  for (unsigned I = 0; I != NumCases; ++I)
    // The input literal is no longer part of the assertion text, so stream
    // the table index to keep a failing entry identifiable.
    EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
        ConvertUTFResultContainer(sourceExhausted), MissingFinalByte[I], true))
        << "MissingFinalByte[" << I << "]";

  // A complete scalar followed by a truncated sequence: U+0041 is still
  // expected in the output together with the sourceExhausted result.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
      "\x41\xc2", true));
}
1678
1679