1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utf.h"
18 
19 #include "common_runtime_test.h"
20 #include "utf-inl.h"
21 
22 #include <map>
23 #include <vector>
24 
25 namespace art {
26 
27 class UtfTest : public CommonRuntimeTest {};
28 
// The leading UTF-16 unit is expected to come from the low 16 bits of the
// packed pair.
TEST_F(UtfTest, GetLeadingUtf16Char) {
  const uint32_t packed_pair = 0xeeeeffff;
  EXPECT_EQ(0xffff, GetLeadingUtf16Char(packed_pair));
}
32 
// The trailing UTF-16 unit is expected to come from the high 16 bits of the
// packed pair, and to be zero when those bits are clear.
TEST_F(UtfTest, GetTrailingUtf16Char) {
  const uint32_t with_trailing_unit = 0xffffeeee;
  const uint32_t without_trailing_unit = 0x0000aaaa;

  EXPECT_EQ(0xffff, GetTrailingUtf16Char(with_trailing_unit));
  EXPECT_EQ(0, GetTrailingUtf16Char(without_trailing_unit));
}
37 
// Expects that `end` points exactly `expected` bytes past `start`. The two
// pointers are compared through their integer addresses.
//
// The definition intentionally carries no trailing semicolon: the caller
// supplies it, so each use expands to a single statement and the macro stays
// usable inside an unbraced if/else.
#define EXPECT_ARRAY_POSITION(expected, end, start) \
  EXPECT_EQ(static_cast<uintptr_t>(expected), \
            reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start))
41 
// Null-terminated UTF-8 test input that exercises every sequence length from
// one to four bytes, in increasing order.
static const uint8_t kAllSequences[] = {
    0x24,                    // One byte:    U+0024.
    0xc2, 0xa2,              // Two bytes:   U+00A2.
    0xe2, 0x82, 0xac,        // Three bytes: U+20AC.
    0xf0, 0x9f, 0x8f, 0xa0,  // Four bytes:  U+1F3E0.
    0x00                     // Terminator.
};
50 
// Null-terminated test input in which each half of the surrogate pair for
// U+10400 is encoded separately as its own three byte UTF-8 sequence.
static const uint8_t kSurrogateEncoding[] = {
    0xed, 0xa0, 0x81,  // Leading surrogate 0xd801.
    0xed, 0xb0, 0x80,  // Trailing surrogate 0xdc00.
    0x00               // Terminator.
};
58 
// Decodes kAllSequences one sequence at a time and checks, for each step, the
// decoded UTF-16 unit(s) and how far the read cursor has advanced.
TEST_F(UtfTest, GetUtf16FromUtf8) {
  const char* const begin = reinterpret_cast<const char*>(kAllSequences);
  const char* cursor = begin;

  // One byte: a single UTF-16 unit, cursor advances by one.
  const uint32_t one_byte = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0x24, GetLeadingUtf16Char(one_byte));
  EXPECT_EQ(0, GetTrailingUtf16Char(one_byte));
  EXPECT_ARRAY_POSITION(1, cursor, begin);

  // Two bytes: a single UTF-16 unit, cursor now at offset 3.
  const uint32_t two_bytes = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xa2, GetLeadingUtf16Char(two_bytes));
  EXPECT_EQ(0, GetTrailingUtf16Char(two_bytes));
  EXPECT_ARRAY_POSITION(3, cursor, begin);

  // Three bytes: a single UTF-16 unit, cursor now at offset 6.
  const uint32_t three_bytes = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0x20ac, GetLeadingUtf16Char(three_bytes));
  EXPECT_EQ(0, GetTrailingUtf16Char(three_bytes));
  EXPECT_ARRAY_POSITION(6, cursor, begin);

  // Four bytes: decoded into a surrogate pair, cursor now at offset 10.
  const uint32_t four_bytes = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xd83c, GetLeadingUtf16Char(four_bytes));
  EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(four_bytes));
  EXPECT_ARRAY_POSITION(10, cursor, begin);

  // The terminator decodes to zero and is consumed as well.
  const uint32_t terminator = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0, GetLeadingUtf16Char(terminator));
  EXPECT_EQ(0, GetTrailingUtf16Char(terminator));
  EXPECT_ARRAY_POSITION(11, cursor, begin);
}
94 
// Surrogate halves that were each encoded as a three byte sequence must be
// decoded one at a time and returned unchanged, never merged into a pair.
TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
  const char* const begin = reinterpret_cast<const char*>(kSurrogateEncoding);
  const char* cursor = begin;

  // First three bytes: the leading surrogate on its own.
  const uint32_t leading_half = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xd801, GetLeadingUtf16Char(leading_half));
  EXPECT_EQ(0, GetTrailingUtf16Char(leading_half));
  EXPECT_ARRAY_POSITION(3, cursor, begin);

  // Next three bytes: the trailing surrogate on its own.
  const uint32_t trailing_half = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xdc00, GetLeadingUtf16Char(trailing_half));
  EXPECT_EQ(0, GetTrailingUtf16Char(trailing_half));
  EXPECT_ARRAY_POSITION(6, cursor, begin);
}
110 
// Counts UTF-16 units in the two fixtures: the four byte sequence contributes
// two units (1 + 1 + 1 + 2 = 5), and each separately encoded surrogate half
// contributes one.
TEST_F(UtfTest, CountModifiedUtf8Chars) {
  const char* const all_sequences = reinterpret_cast<const char*>(kAllSequences);
  const char* const surrogate_halves = reinterpret_cast<const char*>(kSurrogateEncoding);

  EXPECT_EQ(5u, CountModifiedUtf8Chars(all_sequences));
  EXPECT_EQ(2u, CountModifiedUtf8Chars(surrogate_halves));
}
115 
AssertConversion(const std::vector<uint16_t> & input,const std::vector<uint8_t> & expected)116 static void AssertConversion(const std::vector<uint16_t>& input,
117                              const std::vector<uint8_t>& expected) {
118   ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
119 
120   std::vector<uint8_t> output(expected.size());
121   ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
122                              &input[0], input.size());
123   EXPECT_EQ(expected, output);
124 }
125 
// Covers each output length of the UTF-16 to modified UTF-8 conversion, first
// in isolation and then combined in a single input.
TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
  // A properly paired surrogate becomes one four byte sequence.
  AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });

  // Unpaired trailing surrogates are emitted as three byte sequences.
  AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
  AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });

  // A unit above 0x7f and below 0x800 takes two bytes.
  AssertConversion({ 0x0101 }, { 0xc4, 0x81 });

  // Zero is special: it is written with the overlong two byte form c0 80.
  AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });

  // Plain ASCII takes one byte per unit.
  AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });

  // All of the above mixed into one input.
  const std::vector<uint16_t> mixed_utf16 = {
      0xd802, 0xdc02,  // Surrogate pair.
      0xdef0, 0xdcff,  // Three byte encodings.
      0x0101, 0x0000,  // Two byte encodings.
      'p'   , 'p'      // One byte encoding.
  };
  const std::vector<uint8_t> mixed_utf8 = {
      0xf0, 0x90, 0xa0, 0x82,
      0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
      0xc4, 0x81, 0xc0, 0x80,
      0x70, 0x70
  };
  AssertConversion(mixed_utf16, mixed_utf8);
}
155 
// Unpaired or mis-paired surrogates must each be emitted as a three byte
// sequence, whatever surrounds them. Every (prefix, surrogate case, suffix)
// combination from the tables below is checked.
TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
  using Utf16 = std::vector<uint16_t>;
  using Utf8 = std::vector<uint8_t>;
  using CaseTable = std::map<Utf16, Utf8>;

  // A leading surrogate that is the last unit of the input has no partner.
  AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });

  // Text placed before the surrogate case: one, two and three byte encodings.
  const CaseTable prefixes {
      {{ 'h' }, { 'h' }},
      {{ 0 }, { 0xc0, 0x80 }},
      {{ 0x81 }, { 0xc2, 0x81 }},
      {{ 0x801 }, { 0xe0, 0xa0, 0x81 }},
  };
  // Text placed after the surrogate case.
  const CaseTable suffixes {
      {{ 'e' }, { 'e' }},
      {{ 0 }, { 0xc0, 0x80 }},
      {{ 0x7ff }, { 0xdf, 0xbf }},
      {{ 0xffff }, { 0xef, 0xbf, 0xbf }},
  };
  // Lone surrogates and same-kind doubles, none of which form a valid pair.
  const CaseTable tests {
      {{ 0xd801 }, { 0xed, 0xa0, 0x81 }},
      {{ 0xdc00 }, { 0xed, 0xb0, 0x80 }},
      {{ 0xd801, 0xd801 }, { 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81 }},
      {{ 0xdc00, 0xdc00 }, { 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80 }},
  };

  for (const auto& prefix : prefixes) {
    for (const auto& test : tests) {
      for (const auto& suffix : suffixes) {
        // Input and expected output are built by the same concatenation.
        Utf16 utf16(prefix.first);
        utf16.insert(utf16.end(), test.first.begin(), test.first.end());
        utf16.insert(utf16.end(), suffix.first.begin(), suffix.first.end());

        Utf8 utf8(prefix.second);
        utf8.insert(utf8.end(), test.second.begin(), test.second.end());
        utf8.insert(utf8.end(), suffix.second.begin(), suffix.second.end());

        AssertConversion(utf16, utf8);
      }
    }
  }
}
198 
199 // Old versions of functions, here to compare answers with optimized versions.
200 
// Reference counter: returns how many UTF-16 units the null-terminated
// modified UTF-8 string `utf8` decodes to. The sequence length is taken from
// the lead byte alone; continuation bytes are skipped without being inspected.
size_t CountModifiedUtf8Chars_reference(const char* utf8) {
  size_t unit_count = 0;
  for (int lead = *utf8++; lead != '\0'; lead = *utf8++) {
    ++unit_count;
    if ((lead & 0x80) != 0) {
      // At least a two byte sequence: skip the second byte.
      ++utf8;
      if ((lead & 0x20) != 0) {
        // At least a three byte sequence: skip the third byte.
        ++utf8;
        if ((lead & 0x10) != 0) {
          // Four byte sequence: skip the fourth byte and count the extra
          // unit, since it decodes to a surrogate pair.
          ++utf8;
          ++unit_count;
        }
      }
    }
  }
  return unit_count;
}
229 
// Reference counter: returns how many modified UTF-8 bytes are needed to
// encode the `char_count` UTF-16 units starting at `chars`.
static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
  size_t byte_total = 0;
  const uint16_t* const end = chars + char_count;
  for (const uint16_t* cursor = chars; cursor != end; ++cursor) {
    const uint16_t unit = *cursor;
    if (unit != 0 && unit < 0x80) {
      // Non-zero ASCII.
      byte_total += 1;
    } else if (unit < 0x800) {
      // Zero (overlong form) and everything from 0x80 up to 0x7ff.
      byte_total += 2;
    } else {
      const bool is_leading_surrogate = (unit & 0xfc00) == 0xd800;
      const bool has_next = (cursor + 1) != end;
      if (is_leading_surrogate && has_next && (cursor[1] & 0xfc00) == 0xdc00) {
        // A properly paired surrogate is one four byte sequence; consume the
        // trailing half here as well.
        byte_total += 4;
        ++cursor;
      } else {
        // Ordinary three byte unit, or a surrogate without a valid partner.
        byte_total += 3;
      }
    }
  }
  return byte_total;
}
263 
// Reference encoder: writes the modified UTF-8 form of the `char_count` UTF-16
// units at `utf16_in` to `utf8_out`. No terminator is written and the output
// size is not checked; the caller must provide enough room.
static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
                                                 size_t char_count) {
  const uint16_t* const end = utf16_in + char_count;
  while (utf16_in != end) {
    const uint16_t unit = *utf16_in++;

    // Non-zero ASCII is copied through as a single byte.
    if (unit != 0 && unit < 0x80) {
      *utf8_out++ = unit;
      continue;
    }

    // A leading surrogate immediately followed by a trailing surrogate forms
    // one supplementary code point and is written as four bytes. A leading
    // surrogate at the very end of the input, or followed by anything else,
    // falls through to the three byte case below.
    const bool is_leading_surrogate = (unit & 0xfc00) == 0xd800;
    if (is_leading_surrogate && utf16_in != end && (*utf16_in & 0xfc00) == 0xdc00) {
      const uint16_t trailing = *utf16_in++;
      const uint32_t code_point = 0x10000 + ((unit & 0x03ff) << 10) + (trailing & 0x03ff);
      *utf8_out++ = (code_point >> 18) | 0xf0;
      *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
      *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
      *utf8_out++ = (code_point & 0x3f) | 0x80;
      continue;
    }

    if (unit >= 0x0800) {
      // Three bytes, including any surrogate that was not part of a pair.
      *utf8_out++ = (unit >> 12) | 0xe0;
      *utf8_out++ = ((unit >> 6) & 0x3f) | 0x80;
      *utf8_out++ = (unit & 0x3f) | 0x80;
    } else {
      // Two bytes: units from 0x80 to 0x7ff, plus zero in its overlong form.
      *utf8_out++ = (unit >> 6) | 0xc0;
      *utf8_out++ = (unit & 0x3f) | 0x80;
    }
  }
}
306 
307 // Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again.
308 
// Splits `code_point` into a UTF-16 surrogate pair, storing the leading half
// in `first` and the trailing half in `second`. The constant 0xd7c0 is
// 0xd800 - (0x10000 >> 10), which folds the 0x10000 offset into the leading
// half.
static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) {
  const uint32_t upper_bits = code_point >> 10;
  const uint32_t lower_ten_bits = code_point & 0x03ff;
  first = static_cast<uint16_t>(0xd7c0 + upper_bits);
  second = static_cast<uint16_t>(0xdc00 + lower_ten_bits);
}
313 
testConversions(uint16_t * buf,int char_count)314 static void testConversions(uint16_t *buf, int char_count) {
315   char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
316   uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
317   int byte_count_test, byte_count_reference;
318   int char_count_test, char_count_reference;
319 
320   // Calculate the number of utf-8 bytes for the utf-16 chars.
321   byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
322   byte_count_test = CountUtf8Bytes(buf, char_count);
323   EXPECT_EQ(byte_count_reference, byte_count_test);
324 
325   // Convert the utf-16 string to utf-8 bytes.
326   ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
327   ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
328   for (int i = 0; i < byte_count_test; ++i) {
329     EXPECT_EQ(bytes_reference[i], bytes_test[i]);
330   }
331 
332   // Calculate the number of utf-16 chars from the utf-8 bytes.
333   bytes_reference[byte_count_reference] = 0;  // Reference function needs null termination.
334   char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
335   char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
336   EXPECT_EQ(char_count, char_count_reference);
337   EXPECT_EQ(char_count, char_count_test);
338 
339   // Convert the utf-8 bytes back to utf-16 chars.
340   // Does not need copied _reference version of the function because the original
341   // function with the old API is retained for debug/testing code.
342   ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
343   ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
344   for (int i = 0; i < char_count_test; ++i) {
345     EXPECT_EQ(buf[i], out_buf_reference[i]);
346     EXPECT_EQ(buf[i], out_buf_test[i]);
347   }
348 }
349 
// Runs testConversions for every code point from 0 to 0x10ffff except the
// surrogate range, embedding each one between 'h' and 'e' and checking several
// sub-ranges of the resulting buffer.
TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) {
  const int kLastCodePoint = 0x10ffff;
  for (int code_point = 0; code_point <= kLastCodePoint; ++code_point) {
    // Surrogate code points are never assigned to characters and cannot be
    // encoded on their own in utf-16 or utf-8, so they are skipped.
    const bool is_surrogate = code_point >= 0xd800 && code_point <= 0xdfff;
    if (is_surrogate) {
      continue;
    }

    uint16_t units[4] = { 0 };
    units[0] = 'h';
    if (code_point > 0xffff) {
      // Supplementary plane: the code point occupies two UTF-16 units.
      codePointToSurrogatePair(code_point, units[1], units[2]);
      units[3] = 'e';
      testConversions(units, 2);
      testConversions(units, 3);
      testConversions(units, 4);
      testConversions(units + 1, 1);
      testConversions(units + 1, 2);
      testConversions(units + 1, 3);
    } else {
      // Basic plane: the code point is a single UTF-16 unit.
      units[1] = code_point;
      units[2] = 'e';
      testConversions(units, 2);
      testConversions(units, 3);
      testConversions(units + 1, 1);
      testConversions(units + 1, 2);
    }
  }
}
380 
381 }  // namespace art
382