1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *  * Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  *  * Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <iconv.h>
30 
31 #include <ctype.h>
32 #include <endian.h>
33 #include <errno.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <uchar.h>
37 
38 #include "private/bionic_mbstate.h"
39 
40 #define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
41 
42 // Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
43 // equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
44 // here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
// The encodings this implementation can read and write.
enum Encoding {
  US_ASCII = 0,   // One byte per character; only values up to 0x7f are valid.
  UTF_8 = 1,      // Decoded/encoded via mbrtoc32()/c32rtomb().
  UTF_16_LE = 2,  // 16-bit units, least significant byte first.
  UTF_16_BE = 3,  // 16-bit units, most significant byte first.
  UTF_32_LE = 4,  // 32-bit units, least significant byte first.
  UTF_32_BE = 5,  // 32-bit units, most significant byte first.
  WCHAR_T = 6,    // Read and written exactly like UTF_32_LE by the converter.
};
54 
// What to do with a character that can't be decoded or can't be represented in the
// destination encoding. Selected by an optional "//IGNORE" or "//TRANSLIT" suffix.
enum Mode {
  ERROR = 0,     // Default: stop converting and fail with EILSEQ.
  IGNORE = 1,    // Skip the offending input and carry on.
  TRANSLIT = 2,  // Substitute '?' and count the substitution.
};
60 
61 // This matching is strange but true.
62 // See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
// Returns true if the user-supplied encoding name `lhs` matches the canonical name `rhs`.
//
// `rhs` is expected to already be in canonical form (lowercase, alphanumeric only).
// In `lhs`, non-alphanumeric characters are skipped, case is ignored, a '0' that is
// immediately followed by a digit is dropped, and a GNU-style "//" suffix is treated
// as the end of the name.
static bool __match_encoding(const char* lhs, const char* rhs) {
  while (*lhs && *rhs) {
    // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent.
    // Also implement the "delete each 0 that is not preceded by a digit" rule.
    for (; *lhs; ++lhs) {
      // The <ctype.h> functions have undefined behavior for negative arguments, so
      // always go via unsigned char (plain char may be signed).
      const unsigned char current = *lhs;
      const unsigned char following = *(lhs + 1);  // At worst this is the terminating NUL.
      if (isalnum(current) && (current != '0' || !isdigit(following))) break;
    }
    // Case doesn't matter either.
    const unsigned char l = *lhs;
    const unsigned char r = *rhs;
    if (tolower(l) != tolower(r)) break;
    ++lhs;
    ++rhs;
  }
  // As a special case we treat the GNU "//" extensions as end of string.
  const bool lhs_done = (*lhs == '\0' || strncmp(lhs, "//", 2) == 0);
  return lhs_done && *rhs == '\0';
}
79 
__parse_encoding(const char * s,Encoding * encoding,Mode * mode)80 static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
81   const char* suffix = strstr(s, "//");
82   if (suffix) {
83     if (!mode) return false;
84     if (strcmp(suffix, "//IGNORE") == 0) {
85       *mode = IGNORE;
86     } else if (strcmp(suffix, "//TRANSLIT") == 0) {
87       *mode = TRANSLIT;
88     } else {
89       return false;
90     }
91   }
92   if (__match_encoding(s, "utf8")) {
93     *encoding = UTF_8;
94   } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
95     *encoding = US_ASCII;
96   } else if (__match_encoding(s, "utf16le")) {
97     *encoding = UTF_16_LE;
98   } else if (__match_encoding(s, "utf16be")) {
99     *encoding = UTF_16_BE;
100   } else if (__match_encoding(s, "utf32le")) {
101     *encoding = UTF_32_LE;
102   } else if (__match_encoding(s, "utf32be")) {
103     *encoding = UTF_32_BE;
104   } else if (__match_encoding(s, "wchart")) {
105     *encoding = WCHAR_T;
106   } else {
107     return false;
108   }
109   return true;
110 }
111 
112 struct __iconv_t {
113   Encoding src_encoding;
114   Encoding dst_encoding;
115   Mode mode;
116 
__iconv_t__iconv_t117   __iconv_t() : mode(ERROR) {
118   }
119 
Convert__iconv_t120   int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
121     // Reset state.
122     wc = 0;
123     memset(&ps, 0, sizeof(ps));
124     replacement_count = 0;
125     ignored = false;
126     src_buf = src_buf0;
127     src_bytes_left = src_bytes_left0;
128     dst_buf = dst_buf0;
129     dst_bytes_left = dst_bytes_left0;
130 
131     while (*src_bytes_left > 0) {
132       if (!GetNext() || !Convert()) return -1;
133     }
134     return Done();
135   }
136 
137  private:
138   char32_t wc;
139   char buf[16];
140   size_t src_bytes_used;
141   size_t dst_bytes_used;
142   mbstate_t ps;
143 
144   size_t replacement_count;
145   bool ignored;
146 
147   char** src_buf;
148   size_t* src_bytes_left;
149   char** dst_buf;
150   size_t* dst_bytes_left;
151 
GetNext__iconv_t152   bool GetNext() {
153     errno = 0;
154     switch (src_encoding) {
155       case US_ASCII:
156         wc = **src_buf;
157         src_bytes_used = 1;
158         if (wc > 0x7f) errno = EILSEQ;
159         break;
160 
161       case UTF_8:
162         src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
163         if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
164           break;  // EILSEQ already set.
165         } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
166           errno = EINVAL;
167           return false;
168         }
169         break;
170 
171       case UTF_16_BE:
172       case UTF_16_LE: {
173         if (*src_bytes_left < 2) {
174           errno = EINVAL;
175           return false;
176         }
177         bool swap = (src_encoding == UTF_16_BE);
178         wc = In16(*src_buf, swap);
179         // 0xd800-0xdbff: high surrogates
180         // 0xdc00-0xdfff: low surrogates
181         if (wc >= 0xd800 && wc <= 0xdfff) {
182           if (wc >= 0xdc00) {  // Low surrogate before high surrogate.
183             errno = EILSEQ;
184             return false;
185           }
186           if (*src_bytes_left < 4) {
187             errno = EINVAL;
188             return false;
189           }
190           uint16_t hi = wc;
191           uint16_t lo = In16(*src_buf + 2, swap);
192           wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
193           src_bytes_used = 4;
194         }
195         break;
196       }
197 
198       case UTF_32_BE:
199       case UTF_32_LE:
200       case WCHAR_T:
201         if (*src_bytes_left < 4) {
202           errno = EINVAL;
203           return false;
204         }
205         wc = In32(*src_buf, (src_encoding == UTF_32_BE));
206         break;
207     }
208 
209     if (errno == EILSEQ) {
210       switch (mode) {
211         case ERROR:
212           return false;
213         case IGNORE:
214           *src_buf += src_bytes_used;
215           *src_bytes_left -= src_bytes_used;
216           ignored = true;
217           return GetNext();
218         case TRANSLIT:
219           wc = '?';
220           ++replacement_count;
221           return true;
222       }
223     }
224     return true;
225   }
226 
Convert__iconv_t227   bool Convert() {
228     errno = 0;
229     switch (dst_encoding) {
230       case US_ASCII:
231         buf[0] = wc;
232         dst_bytes_used = 1;
233         if (wc > 0x7f) errno = EILSEQ;
234         break;
235 
236       case UTF_8:
237         dst_bytes_used = c32rtomb(buf, wc, &ps);
238         if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
239           break;  // EILSEQ already set.
240         } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
241           errno = EINVAL;
242           return false;
243         }
244         break;
245 
246       case UTF_16_BE:
247       case UTF_16_LE: {
248         bool swap = (dst_encoding == UTF_16_BE);
249         if (wc < 0x10000) {  // BMP.
250           Out16(buf, wc, swap);
251         } else {  // Supplementary plane; output surrogate pair.
252           wc -= 0x10000;
253           char16_t hi = 0xd800 | (wc >> 10);
254           char16_t lo = 0xdc00 | (wc & 0x3ff);
255           Out16(buf + 0, hi, swap);
256           Out16(buf + 2, lo, swap);
257           dst_bytes_used = 4;
258         }
259       } break;
260 
261       case UTF_32_BE:
262       case UTF_32_LE:
263       case WCHAR_T:
264         Out32(wc, (dst_encoding == UTF_32_BE));
265         break;
266     }
267 
268     if (errno == EILSEQ) {
269       if (mode == IGNORE) {
270         *src_buf += src_bytes_used;
271         *src_bytes_left -= src_bytes_used;
272         ignored = true;
273         return true;
274       } else if (mode == TRANSLIT) {
275         wc = '?';
276         ++replacement_count;
277         return Convert();
278       }
279       return false;
280     }
281 
282     return Emit();
283   }
284 
In16__iconv_t285   uint16_t In16(const char* buf, bool swap) {
286     const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
287     uint16_t wc = (src[0]) | (src[1] << 8);
288     if (swap) wc = __swap16(wc);
289     src_bytes_used = 2;
290     return wc;
291   }
292 
In32__iconv_t293   uint32_t In32(const char* buf, bool swap) {
294     const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
295     uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24);
296     if (swap) wc = __swap32(wc);
297     src_bytes_used = 4;
298     return wc;
299   }
300 
Out16__iconv_t301   void Out16(char* dst, char16_t ch, bool swap) {
302     if (swap) ch = __swap16(ch);
303     dst[0] = ch;
304     dst[1] = ch >> 8;
305     dst_bytes_used = 2;
306   }
307 
Out32__iconv_t308   void Out32(char32_t ch, bool swap) {
309     if (swap) ch = __swap32(ch);
310     buf[0] = ch;
311     buf[1] = ch >> 8;
312     buf[2] = ch >> 16;
313     buf[3] = ch >> 24;
314     dst_bytes_used = 4;
315   }
316 
Emit__iconv_t317   bool Emit() {
318     if (dst_bytes_used > *dst_bytes_left) {
319       errno = E2BIG;
320       return false;
321     }
322 
323     memcpy(*dst_buf, buf, dst_bytes_used);
324     *src_buf += src_bytes_used;
325     *src_bytes_left -= src_bytes_used;
326     *dst_buf += dst_bytes_used;
327     *dst_bytes_left -= dst_bytes_used;
328     return true;
329   }
330 
Done__iconv_t331   int Done() {
332     if (mode == TRANSLIT) return replacement_count;
333     if (ignored) {
334       errno = EILSEQ;
335       return -1;
336     }
337     return 0;
338   }
339 };
340 
iconv_open(const char * __dst_encoding,const char * __src_encoding)341 iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
342   iconv_t result = new __iconv_t;
343   if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) ||
344       !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
345     delete result;
346     errno = EINVAL;
347     return INVALID_ICONV_T;
348   }
349   return result;
350 }
351 
iconv(iconv_t __converter,char ** __src_buf,size_t * __src_bytes_left,char ** __dst_buf,size_t * __dst_bytes_left)352 size_t iconv(iconv_t __converter,
353              char** __src_buf, size_t* __src_bytes_left,
354              char** __dst_buf, size_t* __dst_bytes_left) {
355   if (__converter == INVALID_ICONV_T) {
356     errno = EBADF;
357     return -1;
358   }
359   return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
360 }
361 
iconv_close(iconv_t __converter)362 int iconv_close(iconv_t __converter) {
363   if (__converter == INVALID_ICONV_T) {
364     errno = EBADF;
365     return -1;
366   }
367   delete __converter;
368   return 0;
369 }
370