1 /*	$OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */
2 
3 /*-
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <errno.h>
30 #include <sys/param.h>
31 #include <string.h>
32 #include <wchar.h>
33 #include <uchar.h>
34 
35 #include "private/bionic_mbstate.h"
36 
37 //
38 // This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
39 // 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where
40 // mbstate_t was only 4 bytes.
41 //
// The state is the UTF-8 sequence. We only support <= 4-byte sequences so LP32
43 // mbstate_t already has enough space (out of the 4 available bytes we only
44 // need 3 since we should never need to store the entire sequence in the
45 // intermediary state).
46 //
47 // The C standard leaves the conversion state undefined after a bad conversion.
48 // To avoid unexpected failures due to the possible use of the internal private
49 // state we always reset the conversion state when encountering illegal
50 // sequences.
51 //
52 // We also implement the POSIX interface directly rather than being accessed via
53 // function pointers.
54 //
55 
mbsinit(const mbstate_t * ps)56 int mbsinit(const mbstate_t* ps) {
57   return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0));
58 }
59 
// Converts one multibyte character from `s` (examining at most `n` bytes)
// into the wide character `*pwc` by delegating to mbrtoc32(), and returns
// whatever mbrtoc32() returns. When `ps` is null, a function-local static
// conversion state is used instead.
size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) {
  static mbstate_t __private_state;

  mbstate_t* state = ps;
  if (state == NULL) {
    state = &__private_state;
  }

  // Our wchar_t is UTF-32, so the char32_t converter can write straight into
  // the caller's wchar_t.
  char32_t* out = reinterpret_cast<char32_t*>(pwc);
  return mbrtoc32(out, s, n, state);
}
67 
mbsnrtowcs(wchar_t * dst,const char ** src,size_t nmc,size_t len,mbstate_t * ps)68 size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
69   static mbstate_t __private_state;
70   mbstate_t* state = (ps == NULL) ? &__private_state : ps;
71   size_t i, o, r;
72 
73   if (dst == NULL) {
74     /*
75      * The fast path in the loop below is not safe if an ASCII
76      * character appears as anything but the first byte of a
77      * multibyte sequence. Check now to avoid doing it in the loop.
78      */
79     if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
80         && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
81       return reset_and_return_illegal(EILSEQ, state);
82     }
83     for (i = o = 0; i < nmc; i += r, o++) {
84       if (static_cast<uint8_t>((*src)[i]) < 0x80) {
85         // Fast path for plain ASCII characters.
86         if ((*src)[i] == '\0') {
87           *src = nullptr;
88           return reset_and_return(o, state);
89         }
90         r = 1;
91       } else {
92         r = mbrtowc(NULL, *src + i, nmc - i, state);
93         if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
94           return reset_and_return_illegal(EILSEQ, state);
95         }
96         if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
97           return reset_and_return_illegal(EILSEQ, state);
98         }
99         if (r == 0) {
100           *src = nullptr;
101           return reset_and_return(o, state);
102         }
103       }
104     }
105     return reset_and_return(o, state);
106   }
107 
108   /*
109    * The fast path in the loop below is not safe if an ASCII
110    * character appears as anything but the first byte of a
111    * multibyte sequence. Check now to avoid doing it in the loop.
112    */
113   if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
114       && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
115     return reset_and_return_illegal(EILSEQ, state);
116   }
117   for (i = o = 0; i < nmc && o < len; i += r, o++) {
118     if (static_cast<uint8_t>((*src)[i]) < 0x80) {
119       // Fast path for plain ASCII characters.
120       dst[o] = (*src)[i];
121       r = 1;
122       if ((*src)[i] == '\0') {
123         *src = nullptr;
124         return reset_and_return(o, state);
125       }
126     } else {
127       r = mbrtowc(dst + o, *src + i, nmc - i, state);
128       if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
129         *src += i;
130         return reset_and_return_illegal(EILSEQ, state);
131       }
132       if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
133         *src += nmc;
134         return reset_and_return(EILSEQ, state);
135       }
136       if (r == 0) {
137         *src = NULL;
138         return reset_and_return(o, state);
139       }
140     }
141   }
142   *src += i;
143   return reset_and_return(o, state);
144 }
145 
// Forwards to mbsnrtowcs() with SIZE_MAX as the input byte limit, so the
// conversion is bounded only by `len` and by whatever makes mbsnrtowcs() stop.
// All other arguments and the return value are passed through unchanged.
size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
  const size_t unlimited_input = SIZE_MAX;
  return mbsnrtowcs(dst, src, unlimited_input, len, ps);
}
149 
// Converts the wide character `wc` to its multibyte form in `s` by delegating
// to c32rtomb(), and returns whatever c32rtomb() returns. When `ps` is null,
// a function-local static conversion state is used instead.
size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) {
  static mbstate_t __private_state;

  mbstate_t* state = ps;
  if (state == NULL) {
    state = &__private_state;
  }

  // Our wchar_t is UTF-32, so it converts directly to char32_t.
  const char32_t c32 = static_cast<char32_t>(wc);
  return c32rtomb(s, c32, state);
}
157 
wcsnrtombs(char * dst,const wchar_t ** src,size_t nwc,size_t len,mbstate_t * ps)158 size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
159   static mbstate_t __private_state;
160   mbstate_t* state = (ps == NULL) ? &__private_state : ps;
161 
162   if (!mbsinit(state)) {
163     return reset_and_return_illegal(EILSEQ, state);
164   }
165 
166   char buf[MB_LEN_MAX];
167   size_t i, o, r;
168   if (dst == NULL) {
169     for (i = o = 0; i < nwc; i++, o += r) {
170       wchar_t wc = (*src)[i];
171       if (static_cast<uint32_t>(wc) < 0x80) {
172         // Fast path for plain ASCII characters.
173         if (wc == 0) {
174           return o;
175         }
176         r = 1;
177       } else {
178         r = wcrtomb(buf, wc, state);
179         if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
180           return r;
181         }
182       }
183     }
184     return o;
185   }
186 
187   for (i = o = 0; i < nwc && o < len; i++, o += r) {
188     wchar_t wc = (*src)[i];
189     if (static_cast<uint32_t>(wc) < 0x80) {
190       // Fast path for plain ASCII characters.
191       dst[o] = wc;
192       if (wc == 0) {
193         *src = NULL;
194         return o;
195       }
196       r = 1;
197     } else if (len - o >= sizeof(buf)) {
198       // Enough space to translate in-place.
199       r = wcrtomb(dst + o, wc, state);
200       if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
201         *src += i;
202         return r;
203       }
204     } else {
205       // May not be enough space; use temp buffer.
206       r = wcrtomb(buf, wc, state);
207       if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
208         *src += i;
209         return r;
210       }
211       if (r > len - o) {
212         break;
213       }
214       memcpy(dst + o, buf, r);
215     }
216   }
217   *src += i;
218   return o;
219 }
220 
// Forwards to wcsnrtombs() with SIZE_MAX as the wide-character limit, so the
// conversion is bounded only by `len` and by whatever makes wcsnrtombs() stop.
// All other arguments and the return value are passed through unchanged.
size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
  const size_t unlimited_input = SIZE_MAX;
  return wcsnrtombs(dst, src, unlimited_input, len, ps);
}
224 
// Locale-taking variant of wcscoll(): the locale argument is ignored and the
// comparison result of wcscoll(ws1, ws2) is returned unchanged.
int wcscoll_l(const wchar_t* ws1, const wchar_t* ws2, locale_t /* unused */) {
  const int ordering = wcscoll(ws1, ws2);
  return ordering;
}
228 
// Locale-taking variant of wcsxfrm(): the locale argument is ignored and the
// call is forwarded to wcsxfrm(dest, src, n), whose result is returned.
size_t wcsxfrm_l(wchar_t* dest, const wchar_t* src, size_t n, locale_t /* unused */) {
  const size_t transformed_len = wcsxfrm(dest, src, n);
  return transformed_len;
}
232 
// Locale-taking variant of wcstoll(): the locale argument is ignored and the
// call is forwarded to wcstoll(nptr, endptr, base), whose result is returned.
long long wcstoll_l(const wchar_t* nptr, wchar_t** endptr, int base,
                    locale_t /* unused */) {
  const long long value = wcstoll(nptr, endptr, base);
  return value;
}
237 
// Locale-taking variant of wcstoull(): the locale argument is ignored and the
// call is forwarded to wcstoull(nptr, endptr, base), whose result is returned.
unsigned long long wcstoull_l(const wchar_t* nptr, wchar_t** endptr,
                              int base, locale_t /* unused */) {
  const unsigned long long value = wcstoull(nptr, endptr, base);
  return value;
}
242 
// Locale-taking variant of wcstold(): the locale argument is ignored and the
// call is forwarded to wcstold(nptr, endptr), whose result is returned.
long double wcstold_l(const wchar_t* nptr, wchar_t** endptr, locale_t /* unused */) {
  const long double value = wcstold(nptr, endptr);
  return value;
}
246