1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <gtest/gtest.h>
18 
19 #include <iconv.h>
20 
21 #include "utils.h"
22 
// The all-ones iconv_t value that the tests below use both as the expected
// iconv_open() failure result and as a deliberately invalid converter handle.
#define INVALID_ICONV_T (reinterpret_cast<iconv_t>(-1))
24 
// iconv_open() must fail with EINVAL when either encoding name is unknown.
TEST(iconv, iconv_open_EINVAL) {
  // Each row is {first argument, second argument}; at least one is always bogus.
  static const char* const kBadPairs[][2] = {
      {"silly", "silly"},
      {"silly", "UTF-8"},
      {"UTF-8", "silly"},
  };
  for (const auto& pair : kBadPairs) {
    errno = 0;
    ASSERT_EQ(INVALID_ICONV_T, iconv_open(pair[0], pair[1]));
    ASSERT_ERRNO(EINVAL);
  }
}
36 
// Checks that encoding names are compared using the loose matching rules of
// Unicode TR22: punctuation, case and redundant leading zeros are ignored, but
// names that merely look similar must not match.
TEST(iconv, iconv_open_comparator) {
  // Examples from http://www.unicode.org/reports/tr22/#Charset_Alias_Matching:
  // "For example, the following names should match: "UTF-8", "utf8", "u.t.f-008", ..."
  iconv_t c;
  ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "utf8"));
  ASSERT_EQ(0, iconv_close(c));
  ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "u.t.f-008"));
  ASSERT_EQ(0, iconv_close(c));

  // "...but not "utf-80" or "ut8"."
  errno = 0;
  ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "utf-80"));
  ASSERT_ERRNO(EINVAL);
  errno = 0;
  ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "ut8"));
  ASSERT_ERRNO(EINVAL);

  // Not one of the TR22 examples, but another near-miss that must be rejected.
  errno = 0;
  ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "ut80"));
  ASSERT_ERRNO(EINVAL);
}
54 
// Basic end-to-end conversion: three UTF-8 characters become three UTF-32LE code units.
TEST(iconv, iconv_smoke) {
  // 'a' is one byte, U+0666 ٦ is 0xd9 0xa6, and U+1100 ᄀ is 0xe1 0x84 0x80.
  const char* source = "a٦ᄀ";
  char converted[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-32LE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(source);
  size_t src_left = strlen(src);
  char* dst = converted;
  size_t dst_left = sizeof(converted);

  EXPECT_EQ(0U, iconv(cd, &src, &src_left, &dst, &dst_left));

  // The UTF-32LE bytes are read back through a wchar_t pointer, so these
  // comparisons only line up where wchar_t is a 32-bit little-endian type.
  wchar_t* utf32 = reinterpret_cast<wchar_t*>(converted);
  EXPECT_EQ(L'a', utf32[0]);
  EXPECT_EQ(L'٦', utf32[1]);
  EXPECT_EQ(L'ᄀ', utf32[2]);
  EXPECT_EQ(L'\0', utf32[3]);

  // All input consumed; the output shrank by exactly three 4-byte code units.
  const size_t produced = 3 /* chars */ * 4 /* bytes each */;
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(converted) - produced, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
80 
// With "//TRANSLIT", unrepresentable characters are replaced and counted.
TEST(iconv, iconv_lossy_TRANSLIT) {
  // U+0666 ٦ is 0xd9 0xa6 and U+1100 ᄀ is 0xe1 0x84 0x80; neither is ASCII.
  const char* input = "a٦ᄀz";
  char output[BUFSIZ] = {};

  iconv_t cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(input);
  size_t src_left = strlen(src);
  char* dst = output;
  size_t dst_left = sizeof(output);

  // Two input characters (five bytes in total) have no ASCII equivalent, so
  // each becomes a replacement character and the call reports two replacements.
  EXPECT_EQ(2U, iconv(cd, &src, &src_left, &dst, &dst_left));

  // `output` started zero-filled, so a string comparison also covers the
  // byte immediately after the converted text.
  EXPECT_STREQ("a??z", output);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(output) - 4, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
109 
// With "//IGNORE", unrepresentable characters are dropped, but the call still fails.
TEST(iconv, iconv_lossy_IGNORE) {
  // U+0666 ٦ is 0xd9 0xa6 and U+1100 ᄀ is 0xe1 0x84 0x80; neither is ASCII.
  const char* input = "a٦ᄀz";
  char output[BUFSIZ] = {};

  iconv_t cd = iconv_open("ASCII//IGNORE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(input);
  size_t src_left = strlen(src);
  char* dst = output;
  size_t dst_left = sizeof(output);

  // The two non-ASCII characters (five input bytes) are skipped, yet the
  // conversion as a whole is reported as a failure with EILSEQ.
  errno = 0;
  const size_t result = iconv(cd, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(static_cast<size_t>(-1), result);
  EXPECT_ERRNO(EILSEQ);

  // Only the two ASCII characters reach the (initially zero-filled) output.
  EXPECT_STREQ("az", output);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(output) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
137 
// Without any "//" suffix, conversion stops at the first unrepresentable character.
TEST(iconv, iconv_lossy) {
  // U+0666 ٦ is 0xd9 0xa6 and U+1100 ᄀ is 0xe1 0x84 0x80; neither is ASCII.
  const char* input = "a٦ᄀz";
  char output[BUFSIZ] = {};

  iconv_t cd = iconv_open("ASCII", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(input);
  size_t src_left = strlen(src);
  char* dst = output;
  size_t dst_left = sizeof(output);

  // The second character cannot be written as ASCII, so the call fails there.
  errno = 0;
  const size_t result = iconv(cd, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(static_cast<size_t>(-1), result);
  EXPECT_ERRNO(EILSEQ);

  // Only 'a' was written; `output` started zero-filled.
  EXPECT_STREQ("a", output);
  // Still unread: two bytes for ٦, three bytes for ᄀ, and one byte for z.
  EXPECT_EQ(6U, src_left);
  EXPECT_EQ(sizeof(output) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
163 
// A lead byte followed by a non-continuation byte is reported as EILSEQ, and
// the caller can step over the bad byte and resume converting.
TEST(iconv, iconv_malformed_sequence_EILSEQ) {
  // 0xd9 is the first byte of the two-byte U+0666 ٦, but 'z' cannot follow it.
  const char* input = "a\xd9z";
  char output[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(input);
  size_t src_left = strlen(src);
  char* dst = output;
  size_t dst_left = sizeof(output);

  // First pass: conversion stops at the malformed second byte.
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(-1), iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_ERRNO(EILSEQ);
  // The input pointer is left at the start of the invalid sequence.
  EXPECT_EQ('\xd9', *src);

  // Second pass: skip the offending byte by hand and convert the rest.
  ++src;
  --src_left;
  errno = 0;
  EXPECT_EQ(0U, iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_ERRNO(0);

  // Both valid characters reached the (initially zero-filled) output.
  EXPECT_STREQ("az", output);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(output) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
196 
// Input that ends in the middle of a multi-byte character is reported as EINVAL.
TEST(iconv, iconv_incomplete_sequence_EINVAL) {
  // 0xd9 is the first byte of the two-byte U+0666 ٦; the second byte is missing.
  const char* input = "a\xd9";
  char output[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(input);
  size_t src_left = strlen(src);
  char* dst = output;
  size_t dst_left = sizeof(output);

  // The trailing byte only starts a character and there is nothing after it.
  errno = 0;
  const size_t result = iconv(cd, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(static_cast<size_t>(-1), result);
  EXPECT_ERRNO(EINVAL);
  // The input pointer is left at the start of the incomplete sequence.
  EXPECT_EQ('\xd9', *src);

  // Only 'a' was written; the dangling lead byte is still unconsumed.
  EXPECT_STREQ("a", output);
  EXPECT_EQ(1U, src_left);
  EXPECT_EQ(sizeof(output) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
223 
// Feeds a three-byte input through output windows that are too small and
// checks that iconv() reports E2BIG while still making whatever progress it can.
TEST(iconv, iconv_E2BIG) {
  const char* input = "abc";
  char output[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(input);
  size_t src_left = strlen(src);
  char* dst = output;
  size_t dst_left = 0;

  // Each step offers `space` output bytes, must fail with E2BIG, and must
  // leave `unread` input bytes behind.
  struct Step {
    size_t space;
    size_t unread;
  };
  static const Step kTooSmall[] = {
      {1, 2},  // Three bytes needed, one offered: one byte of progress.
      {0, 2},  // Two bytes needed, none offered: no progress possible.
      {1, 1},  // Two bytes needed, one offered: one more byte of progress.
  };
  for (const Step& step : kTooSmall) {
    dst_left = step.space;
    errno = 0;
    EXPECT_EQ(static_cast<size_t>(-1), iconv(cd, &src, &src_left, &dst, &dst_left));
    EXPECT_ERRNO(E2BIG);
    EXPECT_EQ(step.unread, src_left);
    EXPECT_EQ(0U, dst_left);
  }

  // One byte left to convert, so a one-byte window finally suffices.
  dst_left = 1;
  errno = 0;
  EXPECT_EQ(0U, iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_ERRNO(0);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(0U, dst_left);

  // The pieces add up to the full input in the (initially zero-filled) buffer.
  EXPECT_STREQ("abc", output);

  ASSERT_EQ(0, iconv_close(cd));
}
276 
// iconv() on the invalid converter handle must fail with EBADF.
TEST(iconv, iconv_invalid_converter_EBADF) {
  // Empty input and output: only the converter argument matters here.
  char* src = nullptr;
  size_t src_left = 0;
  char* dst = nullptr;
  size_t dst_left = 0;

  errno = 0;
  const size_t result = iconv(INVALID_ICONV_T, &src, &src_left, &dst, &dst_left);
  ASSERT_EQ(static_cast<size_t>(-1), result);
  ASSERT_ERRNO(EBADF);
}
286 
// iconv_close() on the invalid converter handle must fail with EBADF.
TEST(iconv, iconv_close_invalid_converter_EBADF) {
  errno = 0;
  const int rc = iconv_close(INVALID_ICONV_T);
  ASSERT_EQ(-1, rc);
  ASSERT_ERRNO(EBADF);
}
292 
RoundTrip(const char * dst_enc,const char * expected_bytes,size_t n)293 static void RoundTrip(const char* dst_enc, const char* expected_bytes, size_t n) {
294   // Examples from https://en.wikipedia.org/wiki/UTF-16.
295   const char* utf8 = "$€��"; // U+0024, U+20AC, U+10437.
296 
297   iconv_t c = iconv_open(dst_enc, "UTF-8");
298   ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
299 
300   char* in = const_cast<char*>(utf8);
301   size_t in_bytes = strlen(utf8);
302   char buf[BUFSIZ] = {};
303   char* out = buf;
304   size_t out_bytes = sizeof(buf);
305   size_t replacement_count = iconv(c, &in, &in_bytes, &out, &out_bytes);
306 
307   // Check we got the bytes we were expecting.
308   for (size_t i = 0; i < n; ++i) {
309     EXPECT_EQ(expected_bytes[i], buf[i]) << i << ' '<< dst_enc;
310   }
311 
312   ASSERT_EQ(0, iconv_close(c));
313 
314   // We can't round-trip if there were replacements.
315   if (strstr(dst_enc, "ascii")) {
316     GTEST_LOG_(INFO) << "can't round-trip " << dst_enc << "\n";
317     return;
318   }
319   ASSERT_EQ(0U, replacement_count);
320 
321   c = iconv_open("UTF-8", dst_enc);
322   ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
323 
324   in = buf;
325   in_bytes = n;
326   char buf2[BUFSIZ] = {};
327   out = buf2;
328   out_bytes = sizeof(buf2);
329   iconv(c, &in, &in_bytes, &out, &out_bytes);
330 
331   ASSERT_STREQ(utf8, buf2) << dst_enc;
332 
333   ASSERT_EQ(0, iconv_close(c));
334 }
335 
// ASCII with transliteration: both non-ASCII sample characters are expected as '?'.
TEST(iconv, iconv_round_trip_ascii) {
  static const char kExpected[] = "$??";
  RoundTrip("ascii//TRANSLIT", kExpected, sizeof(kExpected) - 1);
}
339 
// UTF-8 to UTF-8: one-, three- and four-byte sequences, eight bytes in total.
TEST(iconv, iconv_round_trip_utf8) {
  static const char kExpected[] = "\x24\xe2\x82\xac\xf0\x90\x90\xb7";
  RoundTrip("utf8", kExpected, sizeof(kExpected) - 1);
}
343 
// Big-endian UTF-16: two single code units followed by a surrogate pair.
TEST(iconv, iconv_round_trip_utf16be) {
  // sizeof() counts the embedded NUL bytes, unlike strlen().
  static const char kExpected[] = "\x00\x24" "\x20\xac" "\xd8\x01\xdc\x37";
  RoundTrip("utf16be", kExpected, sizeof(kExpected) - 1);
}
347 
// Little-endian UTF-16: two single code units followed by a surrogate pair.
TEST(iconv, iconv_round_trip_utf16le) {
  // sizeof() counts the embedded NUL bytes, unlike strlen().
  static const char kExpected[] = "\x24\x00" "\xac\x20" "\x01\xd8\x37\xdc";
  RoundTrip("utf16le", kExpected, sizeof(kExpected) - 1);
}
351 
// Big-endian UTF-32: three code points, four bytes each.
TEST(iconv, iconv_round_trip_utf32be) {
  // sizeof() counts the embedded NUL bytes, unlike strlen().
  static const char kExpected[] = "\x00\x00\x00\x24" "\x00\x00\x20\xac" "\x00\x01\x04\x37";
  RoundTrip("utf32be", kExpected, sizeof(kExpected) - 1);
}
355 
// Little-endian UTF-32: three code points, four bytes each.
TEST(iconv, iconv_round_trip_utf32le) {
  // sizeof() counts the embedded NUL bytes, unlike strlen().
  static const char kExpected[] = "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("utf32le", kExpected, sizeof(kExpected) - 1);
}
359 
// "wchar_t" is expected to produce the same bytes as the utf32le test:
// three code points as 4-byte little-endian values.
TEST(iconv, iconv_round_trip_wchar_t) {
  // sizeof() counts the embedded NUL bytes, unlike strlen().
  static const char kExpected[] = "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("wchar_t", kExpected, sizeof(kExpected) - 1);
}
363 
Check(int expected_errno,const char * src_enc,const char * src,size_t n)364 static void Check(int expected_errno, const char* src_enc, const char* src, size_t n) {
365   iconv_t c = iconv_open("wchar_t", src_enc);
366   char* in = const_cast<char*>(src);
367   size_t in_bytes = n;
368   wchar_t out_buf[16];
369   size_t out_bytes = sizeof(out_buf);
370   char* out = reinterpret_cast<char*>(out_buf);
371   errno = 0;
372   ASSERT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
373   EXPECT_ERRNO(expected_errno);
374   EXPECT_EQ(0, iconv_close(c));
375 }
376 
// A byte above 0x7f is not valid ASCII input.
TEST(iconv, iconv_EILSEQ_ascii) {
  static const char kInput[] = "\xac";  // > 0x7f, so not ASCII.
  Check(EILSEQ, "ASCII", kInput, sizeof(kInput) - 1);
}
380 
// UTF-8 input must not begin with a byte that cannot start a sequence.
TEST(iconv, iconv_EILSEQ_utf8_initial) {
  static const char kInput[] = "\x82";  // Invalid initial byte.
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
384 
// A UTF-8 lead byte must not be followed by another lead byte.
TEST(iconv, iconv_EILSEQ_utf8_non_initial) {
  static const char kInput[] = "\xe2\xe2\x82";  // Invalid second byte.
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
388 
// Big-endian UTF-16 with the surrogate pair swapped: 0xdc37 arrives before 0xd801.
TEST(iconv, iconv_EILSEQ_utf16be_low_surrogate_first) {
  static const char kInput[] = "\xdc\x37" "\xd8\x01";
  Check(EILSEQ, "utf16be", kInput, sizeof(kInput) - 1);
}
392 
// Little-endian UTF-16 with the surrogate pair swapped: 0xdc37 arrives before 0xd801.
TEST(iconv, iconv_EILSEQ_utf16le_low_surrogate_first) {
  static const char kInput[] = "\x37\xdc" "\x01\xd8";
  Check(EILSEQ, "utf16le", kInput, sizeof(kInput) - 1);
}
396 
// A truncated UTF-8 sequence is incomplete input (EINVAL), not illegal input.
TEST(iconv, iconv_EINVAL_utf8_short) {
  static const char kInput[] = "\xe2\x82";  // Missing final byte of 3-byte sequence.
  Check(EINVAL, "utf8", kInput, sizeof(kInput) - 1);
}
400 
// Half of a big-endian UTF-16 code unit is incomplete input.
TEST(iconv, iconv_EINVAL_utf16be_short) {
  // sizeof() is used because the single input byte is itself a NUL.
  static const char kInput[] = "\x00";  // Missing second byte.
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
404 
// Big-endian UTF-16 input ending right after the high surrogate 0xd801.
TEST(iconv, iconv_EINVAL_utf16be_missing_low_surrogate) {
  static const char kInput[] = "\xd8\x01";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
408 
// Big-endian UTF-16 input ending after only the first byte of the low surrogate.
TEST(iconv, iconv_EINVAL_utf16be_half_low_surrogate) {
  static const char kInput[] = "\xd8\x01\xdc";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
412 
// Half of a little-endian UTF-16 code unit is incomplete input.
TEST(iconv, iconv_EINVAL_utf16le_short) {
  static const char kInput[] = "\x24";  // Missing second byte.
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
416 
// Little-endian UTF-16 input ending right after the high surrogate 0xd801.
TEST(iconv, iconv_EINVAL_utf16le_missing_low_surrogate) {
  static const char kInput[] = "\x01\xd8";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
420 
// Little-endian UTF-16 input ending after only the first byte of the low surrogate.
TEST(iconv, iconv_EINVAL_utf16le_half_low_surrogate) {
  static const char kInput[] = "\x01\xd8\x37";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
424 
// Three bytes of a four-byte big-endian UTF-32 code unit is incomplete input.
TEST(iconv, iconv_EINVAL_utf32be_short) {
  // sizeof() is used because every input byte here is a NUL.
  static const char kInput[] = "\x00\x00\x00";  // Missing final byte.
  Check(EINVAL, "utf32be", kInput, sizeof(kInput) - 1);
}
428 
// Three bytes of a four-byte little-endian UTF-32 code unit is incomplete input.
TEST(iconv, iconv_EINVAL_utf32le_short) {
  // sizeof() is used because the input contains embedded NUL bytes.
  static const char kInput[] = "\x24\x00\x00";  // Missing final byte.
  Check(EINVAL, "utf32le", kInput, sizeof(kInput) - 1);
}
432 
// Exercises the "reset to initial shift state" forms of iconv(): each call must
// succeed, leave errno untouched, and write nothing to the output buffer.
TEST(iconv, iconv_initial_shift_state) {
  // POSIX: "For state-dependent encodings, the conversion descriptor
  // cd is placed into its initial shift state by a call for which inbuf
  // is a null pointer, or for which inbuf points to a null pointer."
  iconv_t c = iconv_open("utf8", "utf8");
  // Without a valid converter every call below would be testing the wrong thing.
  ASSERT_NE(INVALID_ICONV_T, c);

  char* in = nullptr;
  size_t in_bytes = 0;
  wchar_t out_buf[16];
  size_t out_bytes = sizeof(out_buf);
  char* out = reinterpret_cast<char*>(out_buf);

  // Points to a null pointer...
  errno = 0;
  ASSERT_EQ(static_cast<size_t>(0), iconv(c, &in, &in_bytes, &out, &out_bytes));
  EXPECT_ERRNO(0);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  // Is a null pointer...
  errno = 0;
  ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, &in_bytes, &out, &out_bytes));
  EXPECT_ERRNO(0);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  // Is a null pointer and so is in_bytes. This isn't specified by POSIX, but
  // glibc and macOS both allow that, where Android historically didn't.
  // https://issuetracker.google.com/180598400
  errno = 0;
  ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, nullptr, &out, &out_bytes));
  EXPECT_ERRNO(0);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  EXPECT_EQ(0, iconv_close(c));
}
466