1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "llvm/Support/ConvertUTF.h"
11 #include "llvm/Support/Format.h"
12 #include "gtest/gtest.h"
13 #include <string>
14 #include <utility>
15 #include <vector>
16 
17 using namespace llvm;
18 
TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
  // Little-endian UTF-16 input introduced by the FF FE byte order mark and
  // spelling U+0CA0 '_' U+0CA0 (the look of disapproval).
  static const char Utf16Bytes[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
  const std::string ExpectedUtf8("\xe0\xb2\xa0_\xe0\xb2\xa0");

  std::string ActualUtf8;
  // sizeof - 1 keeps the literal's implicit NUL terminator out of the input.
  const bool Converted = convertUTF16ToUTF8String(
      ArrayRef<char>(Utf16Bytes, sizeof(Utf16Bytes) - 1), ActualUtf8);

  EXPECT_TRUE(Converted);
  EXPECT_EQ(ExpectedUtf8, ActualUtf8);
}
29 
TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
  // Big-endian UTF-16 input introduced by the FE FF byte order mark and
  // spelling U+0CA0 '_' U+0CA0 (the look of disapproval).
  static const char Utf16Bytes[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
  const std::string ExpectedUtf8("\xe0\xb2\xa0_\xe0\xb2\xa0");

  std::string ActualUtf8;
  // sizeof - 1 keeps the literal's implicit NUL terminator out of the input.
  const bool Converted = convertUTF16ToUTF8String(
      ArrayRef<char>(Utf16Bytes, sizeof(Utf16Bytes) - 1), ActualUtf8);

  EXPECT_TRUE(Converted);
  EXPECT_EQ(ExpectedUtf8, ActualUtf8);
}
40 
TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
  // UTF-8 bytes for U+0CA0 '_' U+0CA0 (the look of disapproval).
  static const char Utf8Bytes[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
  // Only the first three entries are compared below; the trailing 0 is never
  // read.
  static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};

  SmallVector<UTF16, 5> Utf16;
  // sizeof - 1 keeps the literal's implicit NUL terminator out of the input.
  const StringRef Input(Utf8Bytes, sizeof(Utf8Bytes) - 1);
  const bool Converted = convertUTF8ToUTF16String(Input, Utf16);
  EXPECT_TRUE(Converted);

  // A size mismatch aborts the test here, so the loop below never indexes
  // past the end of the converted output.
  ASSERT_EQ(3u, Utf16.size());
  for (int Pos = 0; Pos < 3; ++Pos) {
    EXPECT_EQ(Expected[Pos], Utf16[Pos]);
  }
}
53 
TEST(ConvertUTFTest, OddLengthInput) {
  // A five-byte buffer cannot be a whole number of 16-bit code units, so the
  // conversion is expected to report failure.
  const ArrayRef<char> FiveBytes = makeArrayRef("xxxxx", 5);
  std::string Utf8;
  EXPECT_FALSE(convertUTF16ToUTF8String(FiveBytes, Utf8));
}
59 
TEST(ConvertUTFTest, Empty) {
  // Converting an empty input must succeed and leave the output string empty.
  std::string Utf8;
  const bool Converted = convertUTF16ToUTF8String(None, Utf8);
  EXPECT_TRUE(Converted);
  EXPECT_TRUE(Utf8.empty());
}
66 
TEST(ConvertUTFTest, HasUTF16BOM) {
  // Inputs that start with either byte order mark (FF FE or FE FF) are
  // reported as having a BOM.
  bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
  EXPECT_TRUE(HasBOM);
  HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
  EXPECT_TRUE(HasBOM);
  HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
  EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
  // The literal is split right after \x00 on purpose: a hex escape swallows
  // every hex digit that follows it, so "\x00asdf" would encode the single
  // byte 0x0a rather than a NUL followed by 'a'. The six-byte view covers the
  // BOM, the embedded NUL and "asd".
  HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00" "asdf", 6));
  EXPECT_TRUE(HasBOM);

  // Inputs too short to hold a two-byte mark are reported as having no BOM.
  HasBOM = hasUTF16ByteOrderMark(None);
  EXPECT_FALSE(HasBOM);
  HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
  EXPECT_FALSE(HasBOM);
}
82 
83 struct ConvertUTFResultContainer {
84   ConversionResult ErrorCode;
85   std::vector<unsigned> UnicodeScalars;
86 
ConvertUTFResultContainerConvertUTFResultContainer87   ConvertUTFResultContainer(ConversionResult ErrorCode)
88       : ErrorCode(ErrorCode) {}
89 
90   ConvertUTFResultContainer
withScalarsConvertUTFResultContainer91   withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
92               unsigned US2 = 0x110000, unsigned US3 = 0x110000,
93               unsigned US4 = 0x110000, unsigned US5 = 0x110000,
94               unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
95     ConvertUTFResultContainer Result(*this);
96     if (US0 != 0x110000)
97       Result.UnicodeScalars.push_back(US0);
98     if (US1 != 0x110000)
99       Result.UnicodeScalars.push_back(US1);
100     if (US2 != 0x110000)
101       Result.UnicodeScalars.push_back(US2);
102     if (US3 != 0x110000)
103       Result.UnicodeScalars.push_back(US3);
104     if (US4 != 0x110000)
105       Result.UnicodeScalars.push_back(US4);
106     if (US5 != 0x110000)
107       Result.UnicodeScalars.push_back(US5);
108     if (US6 != 0x110000)
109       Result.UnicodeScalars.push_back(US6);
110     if (US7 != 0x110000)
111       Result.UnicodeScalars.push_back(US7);
112     return Result;
113   }
114 };
115 
116 std::pair<ConversionResult, std::vector<unsigned>>
ConvertUTF8ToUnicodeScalarsLenient(StringRef S)117 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
118   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
119 
120   const UTF8 *SourceNext = SourceStart;
121   std::vector<UTF32> Decoded(S.size(), 0);
122   UTF32 *TargetStart = Decoded.data();
123 
124   auto ErrorCode =
125       ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
126                          Decoded.data() + Decoded.size(), lenientConversion);
127 
128   Decoded.resize(TargetStart - Decoded.data());
129 
130   return std::make_pair(ErrorCode, Decoded);
131 }
132 
133 std::pair<ConversionResult, std::vector<unsigned>>
ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S)134 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
135   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
136 
137   const UTF8 *SourceNext = SourceStart;
138   std::vector<UTF32> Decoded(S.size(), 0);
139   UTF32 *TargetStart = Decoded.data();
140 
141   auto ErrorCode = ConvertUTF8toUTF32Partial(
142       &SourceNext, SourceStart + S.size(), &TargetStart,
143       Decoded.data() + Decoded.size(), lenientConversion);
144 
145   Decoded.resize(TargetStart - Decoded.data());
146 
147   return std::make_pair(ErrorCode, Decoded);
148 }
149 
150 ::testing::AssertionResult
CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,StringRef S,bool Partial=false)151 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
152                                  StringRef S, bool Partial = false) {
153   ConversionResult ErrorCode;
154   std::vector<unsigned> Decoded;
155   if (!Partial)
156     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
157   else
158     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
159 
160   if (Expected.ErrorCode != ErrorCode)
161     return ::testing::AssertionFailure() << "Expected error code "
162                                          << Expected.ErrorCode << ", actual "
163                                          << ErrorCode;
164 
165   if (Expected.UnicodeScalars != Decoded)
166     return ::testing::AssertionFailure()
167            << "Expected lenient decoded result:\n"
168            << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
169            << "Actual result:\n" << ::testing::PrintToString(Decoded);
170 
171   return ::testing::AssertionSuccess();
172 }
173 
TEST(ConvertUTFTest,UTF8ToUTF32Lenient)174 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
175 
176   //
177   // 1-byte sequences
178   //
179 
180   // U+0041 LATIN CAPITAL LETTER A
181   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
182       ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
183 
184   //
185   // 2-byte sequences
186   //
187 
188   // U+0283 LATIN SMALL LETTER ESH
189   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
190       ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
191       "\xca\x83"));
192 
193   // U+03BA GREEK SMALL LETTER KAPPA
194   // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
195   // U+03C3 GREEK SMALL LETTER SIGMA
196   // U+03BC GREEK SMALL LETTER MU
197   // U+03B5 GREEK SMALL LETTER EPSILON
198   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
199       ConvertUTFResultContainer(conversionOK)
200           .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
201       "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
202 
203   //
204   // 3-byte sequences
205   //
206 
207   // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
208   // U+6587 CJK UNIFIED IDEOGRAPH-6587
209   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
210       ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
211       "\xe4\xbe\x8b\xe6\x96\x87"));
212 
213   // U+D55C HANGUL SYLLABLE HAN
214   // U+AE00 HANGUL SYLLABLE GEUL
215   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
216       ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
217       "\xed\x95\x9c\xea\xb8\x80"));
218 
219   // U+1112 HANGUL CHOSEONG HIEUH
220   // U+1161 HANGUL JUNGSEONG A
221   // U+11AB HANGUL JONGSEONG NIEUN
222   // U+1100 HANGUL CHOSEONG KIYEOK
223   // U+1173 HANGUL JUNGSEONG EU
224   // U+11AF HANGUL JONGSEONG RIEUL
225   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
226       ConvertUTFResultContainer(conversionOK)
227           .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
228       "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
229       "\xe1\x86\xaf"));
230 
231   //
232   // 4-byte sequences
233   //
234 
235   // U+E0100 VARIATION SELECTOR-17
236   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
237       ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
238       "\xf3\xa0\x84\x80"));
239 
240   //
241   // First possible sequence of a certain length
242   //
243 
244   // U+0000 NULL
245   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
246       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
247       StringRef("\x00", 1)));
248 
249   // U+0080 PADDING CHARACTER
250   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
251       ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
252       "\xc2\x80"));
253 
254   // U+0800 SAMARITAN LETTER ALAF
255   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
256       ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
257       "\xe0\xa0\x80"));
258 
259   // U+10000 LINEAR B SYLLABLE B008 A
260   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
261       ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
262       "\xf0\x90\x80\x80"));
263 
264   // U+200000 (invalid)
265   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
266       ConvertUTFResultContainer(sourceIllegal)
267           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
268       "\xf8\x88\x80\x80\x80"));
269 
270   // U+4000000 (invalid)
271   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
272       ConvertUTFResultContainer(sourceIllegal)
273           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
274       "\xfc\x84\x80\x80\x80\x80"));
275 
276   //
277   // Last possible sequence of a certain length
278   //
279 
280   // U+007F DELETE
281   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
282       ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
283 
284   // U+07FF (unassigned)
285   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
286       ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
287       "\xdf\xbf"));
288 
289   // U+FFFF (noncharacter)
290   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
291       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
292       "\xef\xbf\xbf"));
293 
294   // U+1FFFFF (invalid)
295   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
296       ConvertUTFResultContainer(sourceIllegal)
297           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
298       "\xf7\xbf\xbf\xbf"));
299 
300   // U+3FFFFFF (invalid)
301   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
302       ConvertUTFResultContainer(sourceIllegal)
303           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
304       "\xfb\xbf\xbf\xbf\xbf"));
305 
306   // U+7FFFFFFF (invalid)
307   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
308       ConvertUTFResultContainer(sourceIllegal)
309           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
310       "\xfd\xbf\xbf\xbf\xbf\xbf"));
311 
312   //
313   // Other boundary conditions
314   //
315 
316   // U+D7FF (unassigned)
317   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
318       ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
319       "\xed\x9f\xbf"));
320 
321   // U+E000 (private use)
322   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
323       ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
324       "\xee\x80\x80"));
325 
326   // U+FFFD REPLACEMENT CHARACTER
327   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
328       ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
329       "\xef\xbf\xbd"));
330 
331   // U+10FFFF (noncharacter)
332   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
333       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
334       "\xf4\x8f\xbf\xbf"));
335 
336   // U+110000 (invalid)
337   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
338       ConvertUTFResultContainer(sourceIllegal)
339           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
340       "\xf4\x90\x80\x80"));
341 
342   //
343   // Unexpected continuation bytes
344   //
345 
346   // A sequence of unexpected continuation bytes that don't follow a first
347   // byte, every byte is a maximal subpart.
348 
349   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
350       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
351   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
352       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
353   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
354       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
355       "\x80\x80"));
356   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
358       "\x80\xbf"));
359   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
360       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
361       "\xbf\x80"));
362   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
363       ConvertUTFResultContainer(sourceIllegal)
364           .withScalars(0xfffd, 0xfffd, 0xfffd),
365       "\x80\xbf\x80"));
366   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
367       ConvertUTFResultContainer(sourceIllegal)
368           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
369       "\x80\xbf\x80\xbf"));
370   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371       ConvertUTFResultContainer(sourceIllegal)
372           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
373       "\x80\xbf\x82\xbf\xaa"));
374   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
375       ConvertUTFResultContainer(sourceIllegal)
376           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
377       "\xaa\xb0\xbb\xbf\xaa\xa0"));
378   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
379       ConvertUTFResultContainer(sourceIllegal)
380           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
381       "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
382 
383   // All continuation bytes (0x80--0xbf).
384   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
385       ConvertUTFResultContainer(sourceIllegal)
386           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
387                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
388           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
389                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
390           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
391                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
392           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
393                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
394           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
395                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
396           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
397                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
398           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
399                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
400           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
401                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
402       "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
403       "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
404       "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
405       "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
406 
407   //
408   // Lonely start bytes
409   //
410 
411   // Start bytes of 2-byte sequences (0xc0--0xdf).
412   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
413       ConvertUTFResultContainer(sourceIllegal)
414           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
415                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
416           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
417                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
418           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
419                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
420           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
421                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
422       "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
423       "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
424 
425   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
426       ConvertUTFResultContainer(sourceIllegal)
427           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
428                        0xfffd, 0x0020, 0xfffd, 0x0020)
429           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
430                        0xfffd, 0x0020, 0xfffd, 0x0020)
431           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
432                        0xfffd, 0x0020, 0xfffd, 0x0020)
433           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
434                        0xfffd, 0x0020, 0xfffd, 0x0020)
435           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
436                        0xfffd, 0x0020, 0xfffd, 0x0020)
437           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
438                        0xfffd, 0x0020, 0xfffd, 0x0020)
439           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
440                        0xfffd, 0x0020, 0xfffd, 0x0020)
441           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
442                        0xfffd, 0x0020, 0xfffd, 0x0020),
443       "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
444       "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
445       "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
446       "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
447 
448   // Start bytes of 3-byte sequences (0xe0--0xef).
449   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
450       ConvertUTFResultContainer(sourceIllegal)
451           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
452                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
453           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
454                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
455       "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
456 
457   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
458       ConvertUTFResultContainer(sourceIllegal)
459           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
460                        0xfffd, 0x0020, 0xfffd, 0x0020)
461           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
462                        0xfffd, 0x0020, 0xfffd, 0x0020)
463           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
464                        0xfffd, 0x0020, 0xfffd, 0x0020)
465           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466                        0xfffd, 0x0020, 0xfffd, 0x0020),
467       "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
468       "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
469 
470   // Start bytes of 4-byte sequences (0xf0--0xf7).
471   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
472       ConvertUTFResultContainer(sourceIllegal)
473           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
474                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
475       "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
476 
477   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
478       ConvertUTFResultContainer(sourceIllegal)
479           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
480                        0xfffd, 0x0020, 0xfffd, 0x0020)
481           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
482                        0xfffd, 0x0020, 0xfffd, 0x0020),
483       "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
484 
485   // Start bytes of 5-byte sequences (0xf8--0xfb).
486   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
487       ConvertUTFResultContainer(sourceIllegal)
488           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
489       "\xf8\xf9\xfa\xfb"));
490 
491   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
492       ConvertUTFResultContainer(sourceIllegal)
493           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
494                        0xfffd, 0x0020, 0xfffd, 0x0020),
495       "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
496 
497   // Start bytes of 6-byte sequences (0xfc--0xfd).
498   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
499       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
500       "\xfc\xfd"));
501 
502   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
503       ConvertUTFResultContainer(sourceIllegal)
504           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
505       "\xfc\x20\xfd\x20"));
506 
507   //
508   // Other bytes (0xc0--0xc1, 0xfe--0xff).
509   //
510 
511   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
513   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
514       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
515   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
516       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
517   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
518       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
519 
520   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
521       ConvertUTFResultContainer(sourceIllegal)
522           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
523       "\xc0\xc1\xfe\xff"));
524 
525   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
526       ConvertUTFResultContainer(sourceIllegal)
527           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
528       "\xfe\xfe\xff\xff"));
529 
530   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
531       ConvertUTFResultContainer(sourceIllegal)
532           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
533       "\xfe\x80\x80\x80\x80\x80"));
534 
535   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
536       ConvertUTFResultContainer(sourceIllegal)
537           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
538       "\xff\x80\x80\x80\x80\x80"));
539 
540   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
541       ConvertUTFResultContainer(sourceIllegal)
542           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
543                        0xfffd, 0x0020, 0xfffd, 0x0020),
544       "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
545 
546   //
547   // Sequences with one continuation byte missing
548   //
549 
550   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
551       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
552   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
553       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
554   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
555       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
556       "\xe0\xa0"));
557   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
558       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
559       "\xe0\xbf"));
560   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
561       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
562       "\xe1\x80"));
563   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
564       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
565       "\xec\xbf"));
566   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
567       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
568       "\xed\x80"));
569   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
570       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
571       "\xed\x9f"));
572   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
573       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
574       "\xee\x80"));
575   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
576       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
577       "\xef\xbf"));
578   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
579       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
580       "\xf0\x90\x80"));
581   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
582       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
583       "\xf0\xbf\xbf"));
584   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
586       "\xf1\x80\x80"));
587   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
589       "\xf3\xbf\xbf"));
590   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
592       "\xf4\x80\x80"));
593   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
595       "\xf4\x8f\xbf"));
596 
597   // Overlong sequences with one trailing byte missing.
598   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
599       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
600       "\xc0"));
601   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
602       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
603       "\xc1"));
604   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
605       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
606       "\xe0\x80"));
607   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
608       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
609       "\xe0\x9f"));
610   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
611       ConvertUTFResultContainer(sourceIllegal)
612           .withScalars(0xfffd, 0xfffd, 0xfffd),
613       "\xf0\x80\x80"));
614   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
615       ConvertUTFResultContainer(sourceIllegal)
616           .withScalars(0xfffd, 0xfffd, 0xfffd),
617       "\xf0\x8f\x80"));
618   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619       ConvertUTFResultContainer(sourceIllegal)
620           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
621       "\xf8\x80\x80\x80"));
622   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
623       ConvertUTFResultContainer(sourceIllegal)
624           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
625       "\xfc\x80\x80\x80\x80"));
626 
627   // Sequences that represent surrogates with one trailing byte missing.
628   // High surrogates
629   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
630       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
631       "\xed\xa0"));
632   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
633       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
634       "\xed\xac"));
635   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
636       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
637       "\xed\xaf"));
638   // Low surrogates
639   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
640       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
641       "\xed\xb0"));
642   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
643       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
644       "\xed\xb4"));
645   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
646       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
647       "\xed\xbf"));
648 
649   // Ill-formed 4-byte sequences.
650   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
651   // U+1100xx (invalid)
652   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
653       ConvertUTFResultContainer(sourceIllegal)
654           .withScalars(0xfffd, 0xfffd, 0xfffd),
655       "\xf4\x90\x80"));
656   // U+13FBxx (invalid)
657   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
658       ConvertUTFResultContainer(sourceIllegal)
659           .withScalars(0xfffd, 0xfffd, 0xfffd),
660       "\xf4\xbf\xbf"));
661   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
662       ConvertUTFResultContainer(sourceIllegal)
663           .withScalars(0xfffd, 0xfffd, 0xfffd),
664       "\xf5\x80\x80"));
665   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
666       ConvertUTFResultContainer(sourceIllegal)
667           .withScalars(0xfffd, 0xfffd, 0xfffd),
668       "\xf6\x80\x80"));
669   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
670       ConvertUTFResultContainer(sourceIllegal)
671           .withScalars(0xfffd, 0xfffd, 0xfffd),
672       "\xf7\x80\x80"));
673   // U+1FFBxx (invalid)
674   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
675       ConvertUTFResultContainer(sourceIllegal)
676           .withScalars(0xfffd, 0xfffd, 0xfffd),
677       "\xf7\xbf\xbf"));
678 
679   // Ill-formed 5-byte sequences.
680   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
681   // U+2000xx (invalid)
682   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
683       ConvertUTFResultContainer(sourceIllegal)
684           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
685       "\xf8\x88\x80\x80"));
686   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
687       ConvertUTFResultContainer(sourceIllegal)
688           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
689       "\xf8\xbf\xbf\xbf"));
690   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
691       ConvertUTFResultContainer(sourceIllegal)
692           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
693       "\xf9\x80\x80\x80"));
694   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
695       ConvertUTFResultContainer(sourceIllegal)
696           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
697       "\xfa\x80\x80\x80"));
698   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
699       ConvertUTFResultContainer(sourceIllegal)
700           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
701       "\xfb\x80\x80\x80"));
702   // U+3FFFFxx (invalid)
703   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
704       ConvertUTFResultContainer(sourceIllegal)
705           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
706       "\xfb\xbf\xbf\xbf"));
707 
708   // Ill-formed 6-byte sequences.
  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
710   // U+40000xx (invalid)
711   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
712       ConvertUTFResultContainer(sourceIllegal)
713           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
714       "\xfc\x84\x80\x80\x80"));
715   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
716       ConvertUTFResultContainer(sourceIllegal)
717           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
718       "\xfc\xbf\xbf\xbf\xbf"));
719   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720       ConvertUTFResultContainer(sourceIllegal)
721           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
722       "\xfd\x80\x80\x80\x80"));
723   // U+7FFFFFxx (invalid)
724   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
725       ConvertUTFResultContainer(sourceIllegal)
726           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
727       "\xfd\xbf\xbf\xbf\xbf"));
728 
729   //
730   // Sequences with two continuation bytes missing
731   //
732 
733   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
734       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
735       "\xf0\x90"));
736   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
737       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
738       "\xf0\xbf"));
739   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
740       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
741       "\xf1\x80"));
742   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
743       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
744       "\xf3\xbf"));
745   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
746       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
747       "\xf4\x80"));
748   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
749       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
750       "\xf4\x8f"));
751 
752   // Overlong sequences with two trailing bytes missing.
753   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
754       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
755   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
756       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
757       "\xf0\x80"));
758   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
759       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
760       "\xf0\x8f"));
761   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
762       ConvertUTFResultContainer(sourceIllegal)
763           .withScalars(0xfffd, 0xfffd, 0xfffd),
764       "\xf8\x80\x80"));
765   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
766       ConvertUTFResultContainer(sourceIllegal)
767           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
768       "\xfc\x80\x80\x80"));
769 
770   // Sequences that represent surrogates with two trailing bytes missing.
771   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
772       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
773 
774   // Ill-formed 4-byte sequences.
775   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
776   // U+110yxx (invalid)
777   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
778       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
779       "\xf4\x90"));
780   // U+13Fyxx (invalid)
781   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
782       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
783       "\xf4\xbf"));
784   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
785       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
786       "\xf5\x80"));
787   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
789       "\xf6\x80"));
790   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
791       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
792       "\xf7\x80"));
793   // U+1FFyxx (invalid)
794   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
795       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
796       "\xf7\xbf"));
797 
798   // Ill-formed 5-byte sequences.
799   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
800   // U+200yxx (invalid)
801   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
802       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
803       "\xf8\x88\x80"));
804   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
805       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
806       "\xf8\xbf\xbf"));
807   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
808       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
809       "\xf9\x80\x80"));
810   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
812       "\xfa\x80\x80"));
813   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
815       "\xfb\x80\x80"));
816   // U+3FFFyxx (invalid)
817   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
818       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
819       "\xfb\xbf\xbf"));
820 
821   // Ill-formed 6-byte sequences.
822   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
823   // U+4000yxx (invalid)
824   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
825       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
826       "\xfc\x84\x80\x80"));
827   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
828       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
829       "\xfc\xbf\xbf\xbf"));
830   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
831       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
832       "\xfd\x80\x80\x80"));
833   // U+7FFFFyxx (invalid)
834   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
836       "\xfd\xbf\xbf\xbf"));
837 
838   //
839   // Sequences with three continuation bytes missing
840   //
841 
842   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
844   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
845       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
846   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
847       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
848   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
849       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
850   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
851       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
852 
853   // Broken overlong sequences.
854   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
855       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
856   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
857       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
858       "\xf8\x80"));
859   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
860       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
861       "\xfc\x80\x80"));
862 
863   // Ill-formed 4-byte sequences.
864   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
865   // U+14yyxx (invalid)
866   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
867       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
868   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
869       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
870   // U+1Cyyxx (invalid)
871   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
872       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
873 
874   // Ill-formed 5-byte sequences.
875   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
876   // U+20yyxx (invalid)
877   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
878       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
879       "\xf8\x88"));
880   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
881       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
882       "\xf8\xbf"));
883   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
884       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
885       "\xf9\x80"));
886   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
887       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
888       "\xfa\x80"));
889   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
891       "\xfb\x80"));
892   // U+3FCyyxx (invalid)
893   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
894       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
895       "\xfb\xbf"));
896 
897   // Ill-formed 6-byte sequences.
898   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
899   // U+400yyxx (invalid)
900   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
901       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
902       "\xfc\x84\x80"));
903   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
904       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
905       "\xfc\xbf\xbf"));
906   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
907       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
908       "\xfd\x80\x80"));
909   // U+7FFCyyxx (invalid)
910   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
911       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
912       "\xfd\xbf\xbf"));
913 
914   //
915   // Sequences with four continuation bytes missing
916   //
917 
918   // Ill-formed 5-byte sequences.
919   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
920   // U+uzyyxx (invalid)
921   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
922       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
923   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
924       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
925   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
926       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
927   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
928       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
929   // U+3zyyxx (invalid)
930   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
931       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
932 
933   // Broken overlong sequences.
934   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
935       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
936   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
937       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
938       "\xfc\x80"));
939 
940   // Ill-formed 6-byte sequences.
941   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
942   // U+uzzyyxx (invalid)
943   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
944       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
945       "\xfc\x84"));
946   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
947       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
948       "\xfc\xbf"));
949   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
950       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
951       "\xfd\x80"));
952   // U+7Fzzyyxx (invalid)
953   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
954       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
955       "\xfd\xbf"));
956 
957   //
958   // Sequences with five continuation bytes missing
959   //
960 
961   // Ill-formed 6-byte sequences.
962   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
963   // U+uzzyyxx (invalid)
964   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
965       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
966   // U+uuzzyyxx (invalid)
967   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
968       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
969 
970   //
971   // Consecutive sequences with trailing bytes missing
972   //
973 
974   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
975       ConvertUTFResultContainer(sourceIllegal)
976           .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
977           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
978           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
979           .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
980           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
981           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
982       "\xc0" "\xe0\x80" "\xf0\x80\x80"
983       "\xf8\x80\x80\x80"
984       "\xfc\x80\x80\x80\x80"
985       "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
986       "\xfb\xbf\xbf\xbf"
987       "\xfd\xbf\xbf\xbf\xbf"));
988 
989   //
990   // Overlong UTF-8 sequences
991   //
992 
993   // U+002F SOLIDUS
994   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
995       ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
996 
997   // Overlong sequences of the above.
998   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
999       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1000       "\xc0\xaf"));
1001   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1002       ConvertUTFResultContainer(sourceIllegal)
1003           .withScalars(0xfffd, 0xfffd, 0xfffd),
1004       "\xe0\x80\xaf"));
1005   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1006       ConvertUTFResultContainer(sourceIllegal)
1007           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1008       "\xf0\x80\x80\xaf"));
1009   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1010       ConvertUTFResultContainer(sourceIllegal)
1011           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1012       "\xf8\x80\x80\x80\xaf"));
1013   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1014       ConvertUTFResultContainer(sourceIllegal)
1015           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1016       "\xfc\x80\x80\x80\x80\xaf"));
1017 
1018   // U+0000 NULL
1019   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1020       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1021       StringRef("\x00", 1)));
1022 
1023   // Overlong sequences of the above.
1024   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1025       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1026       "\xc0\x80"));
1027   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1028       ConvertUTFResultContainer(sourceIllegal)
1029           .withScalars(0xfffd, 0xfffd, 0xfffd),
1030       "\xe0\x80\x80"));
1031   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032       ConvertUTFResultContainer(sourceIllegal)
1033           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1034       "\xf0\x80\x80\x80"));
1035   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1036       ConvertUTFResultContainer(sourceIllegal)
1037           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1038       "\xf8\x80\x80\x80\x80"));
1039   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1040       ConvertUTFResultContainer(sourceIllegal)
1041           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1042       "\xfc\x80\x80\x80\x80\x80"));
1043 
1044   // Other overlong sequences.
1045   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1046       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1047       "\xc0\xbf"));
1048   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1049       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1050       "\xc1\x80"));
1051   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1052       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1053       "\xc1\xbf"));
1054   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1055       ConvertUTFResultContainer(sourceIllegal)
1056           .withScalars(0xfffd, 0xfffd, 0xfffd),
1057       "\xe0\x9f\xbf"));
1058   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1059       ConvertUTFResultContainer(sourceIllegal)
1060           .withScalars(0xfffd, 0xfffd, 0xfffd),
1061       "\xed\xa0\x80"));
1062   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1063       ConvertUTFResultContainer(sourceIllegal)
1064           .withScalars(0xfffd, 0xfffd, 0xfffd),
1065       "\xed\xbf\xbf"));
1066   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1067       ConvertUTFResultContainer(sourceIllegal)
1068           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1069       "\xf0\x8f\x80\x80"));
1070   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1071       ConvertUTFResultContainer(sourceIllegal)
1072           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1073       "\xf0\x8f\xbf\xbf"));
1074   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1075       ConvertUTFResultContainer(sourceIllegal)
1076           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1077       "\xf8\x87\xbf\xbf\xbf"));
1078   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1079       ConvertUTFResultContainer(sourceIllegal)
1080           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1081       "\xfc\x83\xbf\xbf\xbf\xbf"));
1082 
1083   //
1084   // Isolated surrogates
1085   //
1086 
1087   // Unicode 6.3.0:
1088   //
1089   //    D71.  High-surrogate code point: A Unicode code point in the range
1090   //    U+D800 to U+DBFF.
1091   //
1092   //    D73.  Low-surrogate code point: A Unicode code point in the range
1093   //    U+DC00 to U+DFFF.
1094 
1095   // Note: U+E0100 is <DB40 DD00> in UTF16.
1096 
1097   // High surrogates
1098 
1099   // U+D800
1100   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1101       ConvertUTFResultContainer(sourceIllegal)
1102           .withScalars(0xfffd, 0xfffd, 0xfffd),
1103       "\xed\xa0\x80"));
1104 
1105   // U+DB40
1106   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107       ConvertUTFResultContainer(sourceIllegal)
1108           .withScalars(0xfffd, 0xfffd, 0xfffd),
1109       "\xed\xac\xa0"));
1110 
1111   // U+DBFF
1112   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113       ConvertUTFResultContainer(sourceIllegal)
1114           .withScalars(0xfffd, 0xfffd, 0xfffd),
1115       "\xed\xaf\xbf"));
1116 
1117   // Low surrogates
1118 
1119   // U+DC00
1120   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1121       ConvertUTFResultContainer(sourceIllegal)
1122           .withScalars(0xfffd, 0xfffd, 0xfffd),
1123       "\xed\xb0\x80"));
1124 
1125   // U+DD00
1126   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1127       ConvertUTFResultContainer(sourceIllegal)
1128           .withScalars(0xfffd, 0xfffd, 0xfffd),
1129       "\xed\xb4\x80"));
1130 
1131   // U+DFFF
1132   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133       ConvertUTFResultContainer(sourceIllegal)
1134           .withScalars(0xfffd, 0xfffd, 0xfffd),
1135       "\xed\xbf\xbf"));
1136 
1137   // Surrogate pairs
1138 
1139   // U+D800 U+DC00
1140   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1141       ConvertUTFResultContainer(sourceIllegal)
1142           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1143       "\xed\xa0\x80\xed\xb0\x80"));
1144 
1145   // U+D800 U+DD00
1146   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1147       ConvertUTFResultContainer(sourceIllegal)
1148           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1149       "\xed\xa0\x80\xed\xb4\x80"));
1150 
1151   // U+D800 U+DFFF
1152   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1153       ConvertUTFResultContainer(sourceIllegal)
1154           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1155       "\xed\xa0\x80\xed\xbf\xbf"));
1156 
1157   // U+DB40 U+DC00
1158   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1159       ConvertUTFResultContainer(sourceIllegal)
1160           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1161       "\xed\xac\xa0\xed\xb0\x80"));
1162 
1163   // U+DB40 U+DD00
1164   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1165       ConvertUTFResultContainer(sourceIllegal)
1166           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1167       "\xed\xac\xa0\xed\xb4\x80"));
1168 
1169   // U+DB40 U+DFFF
1170   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1171       ConvertUTFResultContainer(sourceIllegal)
1172           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1173       "\xed\xac\xa0\xed\xbf\xbf"));
1174 
1175   // U+DBFF U+DC00
1176   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1177       ConvertUTFResultContainer(sourceIllegal)
1178           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1179       "\xed\xaf\xbf\xed\xb0\x80"));
1180 
1181   // U+DBFF U+DD00
1182   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1183       ConvertUTFResultContainer(sourceIllegal)
1184           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1185       "\xed\xaf\xbf\xed\xb4\x80"));
1186 
1187   // U+DBFF U+DFFF
1188   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1189       ConvertUTFResultContainer(sourceIllegal)
1190           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1191       "\xed\xaf\xbf\xed\xbf\xbf"));
1192 
1193   //
1194   // Noncharacters
1195   //
1196 
1197   // Unicode 6.3.0:
1198   //
1199   //    D14.  Noncharacter: A code point that is permanently reserved for
1200   //    internal use and that should never be interchanged. Noncharacters
1201   //    consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1202   //    and the values U+FDD0..U+FDEF.
1203 
1204   // U+FFFE
1205   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1206       ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1207       "\xef\xbf\xbe"));
1208 
1209   // U+FFFF
1210   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1211       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1212       "\xef\xbf\xbf"));
1213 
1214   // U+1FFFE
1215   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1216       ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1217       "\xf0\x9f\xbf\xbe"));
1218 
1219   // U+1FFFF
1220   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1221       ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1222       "\xf0\x9f\xbf\xbf"));
1223 
1224   // U+2FFFE
1225   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1226       ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1227       "\xf0\xaf\xbf\xbe"));
1228 
1229   // U+2FFFF
1230   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1231       ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1232       "\xf0\xaf\xbf\xbf"));
1233 
1234   // U+3FFFE
1235   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1236       ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1237       "\xf0\xbf\xbf\xbe"));
1238 
1239   // U+3FFFF
1240   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1241       ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1242       "\xf0\xbf\xbf\xbf"));
1243 
1244   // U+4FFFE
1245   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1246       ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1247       "\xf1\x8f\xbf\xbe"));
1248 
1249   // U+4FFFF
1250   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1251       ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1252       "\xf1\x8f\xbf\xbf"));
1253 
1254   // U+5FFFE
1255   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1256       ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1257       "\xf1\x9f\xbf\xbe"));
1258 
1259   // U+5FFFF
1260   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1261       ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1262       "\xf1\x9f\xbf\xbf"));
1263 
1264   // U+6FFFE
1265   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1266       ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1267       "\xf1\xaf\xbf\xbe"));
1268 
1269   // U+6FFFF
1270   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1271       ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1272       "\xf1\xaf\xbf\xbf"));
1273 
1274   // U+7FFFE
1275   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1276       ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1277       "\xf1\xbf\xbf\xbe"));
1278 
1279   // U+7FFFF
1280   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1281       ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1282       "\xf1\xbf\xbf\xbf"));
1283 
1284   // U+8FFFE
1285   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1286       ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1287       "\xf2\x8f\xbf\xbe"));
1288 
1289   // U+8FFFF
1290   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1291       ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1292       "\xf2\x8f\xbf\xbf"));
1293 
1294   // U+9FFFE
1295   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1296       ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1297       "\xf2\x9f\xbf\xbe"));
1298 
1299   // U+9FFFF
1300   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1301       ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1302       "\xf2\x9f\xbf\xbf"));
1303 
1304   // U+AFFFE
1305   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1306       ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1307       "\xf2\xaf\xbf\xbe"));
1308 
1309   // U+AFFFF
1310   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1311       ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1312       "\xf2\xaf\xbf\xbf"));
1313 
1314   // U+BFFFE
1315   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1316       ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1317       "\xf2\xbf\xbf\xbe"));
1318 
1319   // U+BFFFF
1320   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1321       ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1322       "\xf2\xbf\xbf\xbf"));
1323 
1324   // U+CFFFE
1325   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1326       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1327       "\xf3\x8f\xbf\xbe"));
1328 
1329   // U+CFFFF
1330   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1331       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1332       "\xf3\x8f\xbf\xbf"));
1333 
1334   // U+DFFFE
1335   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1336       ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1337       "\xf3\x9f\xbf\xbe"));
1338 
1339   // U+DFFFF
1340   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1341       ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1342       "\xf3\x9f\xbf\xbf"));
1343 
1344   // U+EFFFE
1345   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1346       ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1347       "\xf3\xaf\xbf\xbe"));
1348 
1349   // U+EFFFF
1350   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1351       ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1352       "\xf3\xaf\xbf\xbf"));
1353 
1354   // U+FFFFE
1355   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1356       ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1357       "\xf3\xbf\xbf\xbe"));
1358 
1359   // U+FFFFF
1360   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1361       ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1362       "\xf3\xbf\xbf\xbf"));
1363 
1364   // U+10FFFE
1365   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1366       ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1367       "\xf4\x8f\xbf\xbe"));
1368 
1369   // U+10FFFF
1370   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1371       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1372       "\xf4\x8f\xbf\xbf"));
1373 
1374   // U+FDD0
1375   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1376       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1377       "\xef\xb7\x90"));
1378 
1379   // U+FDD1
1380   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1381       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1382       "\xef\xb7\x91"));
1383 
1384   // U+FDD2
1385   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1386       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1387       "\xef\xb7\x92"));
1388 
1389   // U+FDD3
1390   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1391       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1392       "\xef\xb7\x93"));
1393 
1394   // U+FDD4
1395   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1396       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1397       "\xef\xb7\x94"));
1398 
1399   // U+FDD5
1400   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1401       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1402       "\xef\xb7\x95"));
1403 
1404   // U+FDD6
1405   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1406       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1407       "\xef\xb7\x96"));
1408 
1409   // U+FDD7
1410   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1411       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1412       "\xef\xb7\x97"));
1413 
1414   // U+FDD8
1415   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1416       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1417       "\xef\xb7\x98"));
1418 
1419   // U+FDD9
1420   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1421       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1422       "\xef\xb7\x99"));
1423 
1424   // U+FDDA
1425   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1426       ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1427       "\xef\xb7\x9a"));
1428 
1429   // U+FDDB
1430   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1431       ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1432       "\xef\xb7\x9b"));
1433 
1434   // U+FDDC
1435   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1436       ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1437       "\xef\xb7\x9c"));
1438 
1439   // U+FDDD
1440   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1441       ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1442       "\xef\xb7\x9d"));
1443 
1444   // U+FDDE
1445   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1446       ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1447       "\xef\xb7\x9e"));
1448 
1449   // U+FDDF
1450   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1451       ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1452       "\xef\xb7\x9f"));
1453 
1454   // U+FDE0
1455   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1456       ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1457       "\xef\xb7\xa0"));
1458 
1459   // U+FDE1
1460   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1461       ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1462       "\xef\xb7\xa1"));
1463 
1464   // U+FDE2
1465   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1466       ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1467       "\xef\xb7\xa2"));
1468 
1469   // U+FDE3
1470   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1471       ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1472       "\xef\xb7\xa3"));
1473 
1474   // U+FDE4
1475   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1476       ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1477       "\xef\xb7\xa4"));
1478 
1479   // U+FDE5
1480   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1481       ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1482       "\xef\xb7\xa5"));
1483 
1484   // U+FDE6
1485   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1486       ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1487       "\xef\xb7\xa6"));
1488 
1489   // U+FDE7
1490   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1491       ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1492       "\xef\xb7\xa7"));
1493 
1494   // U+FDE8
1495   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1496       ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1497       "\xef\xb7\xa8"));
1498 
1499   // U+FDE9
1500   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1501       ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1502       "\xef\xb7\xa9"));
1503 
1504   // U+FDEA
1505   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1506       ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1507       "\xef\xb7\xaa"));
1508 
1509   // U+FDEB
1510   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1511       ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1512       "\xef\xb7\xab"));
1513 
1514   // U+FDEC
1515   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1516       ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1517       "\xef\xb7\xac"));
1518 
1519   // U+FDED
1520   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1521       ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1522       "\xef\xb7\xad"));
1523 
1524   // U+FDEE
1525   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1526       ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1527       "\xef\xb7\xae"));
1528 
1529   // U+FDEF
1530   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1531       ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1532       "\xef\xb7\xaf"));
1533 
1534   // U+FDF0
1535   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1536       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1537       "\xef\xb7\xb0"));
1538 
1539   // U+FDF1
1540   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1541       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1542       "\xef\xb7\xb1"));
1543 
1544   // U+FDF2
1545   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1546       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1547       "\xef\xb7\xb2"));
1548 
1549   // U+FDF3
1550   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1551       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1552       "\xef\xb7\xb3"));
1553 
1554   // U+FDF4
1555   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1556       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1557       "\xef\xb7\xb4"));
1558 
1559   // U+FDF5
1560   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1561       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1562       "\xef\xb7\xb5"));
1563 
1564   // U+FDF6
1565   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1566       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1567       "\xef\xb7\xb6"));
1568 
1569   // U+FDF7
1570   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1571       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1572       "\xef\xb7\xb7"));
1573 
1574   // U+FDF8
1575   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1576       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1577       "\xef\xb7\xb8"));
1578 
1579   // U+FDF9
1580   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1581       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1582       "\xef\xb7\xb9"));
1583 
1584   // U+FDFA
1585   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1586       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1587       "\xef\xb7\xba"));
1588 
1589   // U+FDFB
1590   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1591       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1592       "\xef\xb7\xbb"));
1593 
1594   // U+FDFC
1595   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1596       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1597       "\xef\xb7\xbc"));
1598 
1599   // U+FDFD
1600   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1601       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1602       "\xef\xb7\xbd"));
1603 
1604   // U+FDFE
1605   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1606       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1607       "\xef\xb7\xbe"));
1608 
1609   // U+FDFF
1610   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1611       ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1612       "\xef\xb7\xbf"));
1613 }
1614 
TEST(ConvertUTFTest,UTF8ToUTF32PartialLenient)1615 TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1616   // U+0041 LATIN CAPITAL LETTER A
1617   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1618       ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1619       "\x41", true));
1620 
1621   //
1622   // Sequences with one continuation byte missing
1623   //
1624 
1625   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1626       ConvertUTFResultContainer(sourceExhausted),
1627       "\xc2", true));
1628   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1629       ConvertUTFResultContainer(sourceExhausted),
1630       "\xdf", true));
1631   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1632       ConvertUTFResultContainer(sourceExhausted),
1633       "\xe0\xa0", true));
1634   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1635       ConvertUTFResultContainer(sourceExhausted),
1636       "\xe0\xbf", true));
1637   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1638       ConvertUTFResultContainer(sourceExhausted),
1639       "\xe1\x80", true));
1640   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1641       ConvertUTFResultContainer(sourceExhausted),
1642       "\xec\xbf", true));
1643   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1644       ConvertUTFResultContainer(sourceExhausted),
1645       "\xed\x80", true));
1646   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1647       ConvertUTFResultContainer(sourceExhausted),
1648       "\xed\x9f", true));
1649   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1650       ConvertUTFResultContainer(sourceExhausted),
1651       "\xee\x80", true));
1652   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1653       ConvertUTFResultContainer(sourceExhausted),
1654       "\xef\xbf", true));
1655   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1656       ConvertUTFResultContainer(sourceExhausted),
1657       "\xf0\x90\x80", true));
1658   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1659       ConvertUTFResultContainer(sourceExhausted),
1660       "\xf0\xbf\xbf", true));
1661   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1662       ConvertUTFResultContainer(sourceExhausted),
1663       "\xf1\x80\x80", true));
1664   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1665       ConvertUTFResultContainer(sourceExhausted),
1666       "\xf3\xbf\xbf", true));
1667   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1668       ConvertUTFResultContainer(sourceExhausted),
1669       "\xf4\x80\x80", true));
1670   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1671       ConvertUTFResultContainer(sourceExhausted),
1672       "\xf4\x8f\xbf", true));
1673 
1674   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1675       ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1676       "\x41\xc2", true));
1677 }
1678 
1679