1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utf.h"
18 
19 #include "common_runtime_test.h"
20 #include "utf-inl.h"
21 
22 #include <vector>
23 
24 namespace art {
25 
26 class UtfTest : public CommonRuntimeTest {};
27 
TEST_F(UtfTest, GetLeadingUtf16Char) {
  // The leading unit is expected to be the low 16 bits of the packed value.
  const uint32_t packed = 0xeeeeffff;
  EXPECT_EQ(0xffff, GetLeadingUtf16Char(packed));
}
31 
TEST_F(UtfTest, GetTrailingUtf16Char) {
  // The trailing unit is expected to be the high 16 bits of the packed value.
  const uint32_t with_trailing = 0xffffeeee;
  EXPECT_EQ(0xffff, GetTrailingUtf16Char(with_trailing));

  // When the high 16 bits are clear there is no trailing unit.
  const uint32_t without_trailing = 0x0000aaaa;
  EXPECT_EQ(0, GetTrailingUtf16Char(without_trailing));
}
36 
// Expects that |end| lies exactly |expected| bytes past |start|.
//
// The definition deliberately does not end in a semicolon: the caller supplies
// it, so an invocation expands to exactly one statement and remains usable as
// the body of an unbraced if/else.
#define EXPECT_ARRAY_POSITION(expected, end, start) \
  EXPECT_EQ(static_cast<uintptr_t>(expected), \
            reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start))
40 
// Null-terminated test input holding one UTF-8 sequence of each length
// (one, two, three and four bytes), in that order.
static const uint8_t kAllSequences[] = {
    0x24,                    // 1 byte:  U+0024
    0xc2, 0xa2,              // 2 bytes: U+00A2
    0xe2, 0x82, 0xac,        // 3 bytes: U+20AC
    0xf0, 0x9f, 0x8f, 0xa0,  // 4 bytes: U+1F3E0
    0x00                     // terminator
};
49 
// Null-terminated test input in which each half of a surrogate pair is
// encoded as its own three-byte sequence. Together the two halves denote
// code point U+10400.
static const uint8_t kSurrogateEncoding[] = {
    0xed, 0xa0, 0x81,  // leading surrogate 0xd801
    0xed, 0xb0, 0x80,  // trailing surrogate 0xdc00
    0x00               // terminator
};
57 
TEST_F(UtfTest, GetUtf16FromUtf8) {
  // Walk kAllSequences one decode call at a time. After every call check the
  // leading/trailing UTF-16 units returned and the total number of bytes the
  // cursor has advanced from the beginning of the input.
  const char* const begin = reinterpret_cast<const char*>(kAllSequences);
  const char* cursor = begin;

  // One byte in, a single unit out.
  uint32_t utf16 = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0x24, GetLeadingUtf16Char(utf16));
  EXPECT_EQ(0, GetTrailingUtf16Char(utf16));
  EXPECT_ARRAY_POSITION(1, cursor, begin);

  // Two bytes in, a single unit out.
  utf16 = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xa2, GetLeadingUtf16Char(utf16));
  EXPECT_EQ(0, GetTrailingUtf16Char(utf16));
  EXPECT_ARRAY_POSITION(3, cursor, begin);

  // Three bytes in, a single unit out.
  utf16 = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0x20ac, GetLeadingUtf16Char(utf16));
  EXPECT_EQ(0, GetTrailingUtf16Char(utf16));
  EXPECT_ARRAY_POSITION(6, cursor, begin);

  // Four bytes in, a surrogate pair out (both units populated).
  utf16 = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xd83c, GetLeadingUtf16Char(utf16));
  EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(utf16));
  EXPECT_ARRAY_POSITION(10, cursor, begin);

  // The terminating zero byte decodes to a zero unit and is consumed as well.
  utf16 = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0, GetLeadingUtf16Char(utf16));
  EXPECT_EQ(0, GetTrailingUtf16Char(utf16));
  EXPECT_ARRAY_POSITION(11, cursor, begin);
}
93 
TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
  // Each three-byte-encoded surrogate must be returned as its own UTF-16 unit
  // with no trailing unit; the decoder must not combine the two halves.
  const char* const begin = reinterpret_cast<const char*>(kSurrogateEncoding);
  const char* cursor = begin;

  // First three bytes: the leading surrogate.
  uint32_t utf16 = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xd801, GetLeadingUtf16Char(utf16));
  EXPECT_EQ(0, GetTrailingUtf16Char(utf16));
  EXPECT_ARRAY_POSITION(3, cursor, begin);

  // Next three bytes: the trailing surrogate.
  utf16 = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xdc00, GetLeadingUtf16Char(utf16));
  EXPECT_EQ(0, GetTrailingUtf16Char(utf16));
  EXPECT_ARRAY_POSITION(6, cursor, begin);
}
109 
TEST_F(UtfTest, CountModifiedUtf8Chars) {
  // kAllSequences holds four encoded sequences but is expected to count as
  // five: the four-byte sequence accounts for two UTF-16 units.
  const char* const all_sequences = reinterpret_cast<const char*>(kAllSequences);
  EXPECT_EQ(5u, CountModifiedUtf8Chars(all_sequences));

  // Two separately encoded surrogates are expected to count as two.
  const char* const surrogates = reinterpret_cast<const char*>(kSurrogateEncoding);
  EXPECT_EQ(2u, CountModifiedUtf8Chars(surrogates));
}
114 
AssertConversion(const std::vector<uint16_t> input,const std::vector<uint8_t> expected)115 static void AssertConversion(const std::vector<uint16_t> input,
116                              const std::vector<uint8_t> expected) {
117   ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
118 
119   std::vector<uint8_t> output(expected.size());
120   ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), &input[0], input.size());
121   EXPECT_EQ(expected, output);
122 }
123 
TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
  // A well-formed surrogate pair becomes a single four-byte sequence.
  AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });

  // Lone units from the trailing-surrogate range (0xdc00-0xdfff) are each
  // emitted as a three-byte sequence.
  AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
  AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });

  // A unit that needs two bytes.
  AssertConversion({ 0x0101 }, { 0xc4, 0x81 });

  // Zero is special: it must be written as the overlong two-byte form
  // 0xc0 0x80 rather than a single zero byte.
  AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });

  // Units that need a single byte each.
  AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });

  // All of the above categories mixed into one input.
  const std::vector<uint16_t> mixed_utf16 = {
      0xd802, 0xdc02,  // surrogate pair -> four bytes
      0xdef0, 0xdcff,  // lone surrogates -> three bytes each
      0x0101, 0x0000,  // two bytes each
      'p'   , 'p'      // one byte each
  };
  const std::vector<uint8_t> mixed_utf8 = {
      0xf0, 0x90, 0xa0, 0x82,
      0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
      0xc4, 0x81, 0xc0, 0x80,
      0x70, 0x70
  };
  AssertConversion(mixed_utf16, mixed_utf8);
}
153 
TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
  // Surrogates that do not form a valid leading+trailing pair must each be
  // written out as an individual three-byte sequence.

  // Leading surrogate as the last unit of the input.
  AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });

  // Leading surrogate followed by an ordinary character.
  AssertConversion({ 'h', 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 'e' });

  // Two leading surrogates in a row.
  AssertConversion({ 'h', 0xd801, 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81, 'e' });

  // Two trailing surrogates in a row.
  AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' });
}
162 
163 }  // namespace art
164