1 /*	$OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */
2 
3 /*-
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <errno.h>
30 #include <sys/param.h>
31 #include <string.h>
32 #include <wchar.h>
33 #include <uchar.h>
34 
35 #include "private/bionic_mbstate.h"
36 
37 //
38 // This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
39 // 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where
40 // mbstate_t was only 4 bytes.
41 //
// The state is the UTF-8 sequence. We only support sequences of <= 4 bytes, so
// the LP32 mbstate_t already has enough space (out of the 4 available bytes we
// only need 3, since we should never need to store the entire sequence in the
// intermediate state).
46 //
47 // The C standard leaves the conversion state undefined after a bad conversion.
48 // To avoid unexpected failures due to the possible use of the internal private
49 // state we always reset the conversion state when encountering illegal
50 // sequences.
51 //
52 // We also implement the POSIX interface directly rather than being accessed via
53 // function pointers.
54 //
55 
mbsinit(const mbstate_t * ps)56 int mbsinit(const mbstate_t* ps) {
57   return (ps == nullptr || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0));
58 }
59 
// Converts the multibyte sequence at `s` (at most `n` bytes) into one wide
// character stored through `pwc`, by forwarding to mbrtoc32().
//
// When `ps` is null, a conversion state private to this function is used.
// The return value is whatever mbrtoc32() returns.
size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) {
  static mbstate_t fallback_state;
  if (ps == nullptr) {
    ps = &fallback_state;
  }

  // Our wchar_t is UTF-32, so the output slot can be handed to mbrtoc32() as-is.
  char32_t* out = reinterpret_cast<char32_t*>(pwc);
  return mbrtoc32(out, s, n, ps);
}
67 
mbsnrtowcs(wchar_t * dst,const char ** src,size_t nmc,size_t len,mbstate_t * ps)68 size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
69   static mbstate_t __private_state;
70   mbstate_t* state = (ps == nullptr) ? &__private_state : ps;
71   size_t i, o, r;
72 
73   // The fast paths in the loops below are not safe if an ASCII
74   // character appears as anything but the first byte of a
75   // multibyte sequence. Check now to avoid doing it in the loops.
76   if (nmc > 0 && mbstate_bytes_so_far(state) > 0 && static_cast<uint8_t>((*src)[0]) < 0x80) {
77     return mbstate_reset_and_return_illegal(EILSEQ, state);
78   }
79 
80   // Measure only?
81   if (dst == nullptr) {
82     for (i = o = 0; i < nmc; i += r, o++) {
83       if (static_cast<uint8_t>((*src)[i]) < 0x80) {
84         // Fast path for plain ASCII characters.
85         if ((*src)[i] == '\0') {
86           return mbstate_reset_and_return(o, state);
87         }
88         r = 1;
89       } else {
90         r = mbrtowc(nullptr, *src + i, nmc - i, state);
91         if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
92           return mbstate_reset_and_return_illegal(EILSEQ, state);
93         }
94         if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
95           return mbstate_reset_and_return_illegal(EILSEQ, state);
96         }
97         if (r == 0) {
98           return mbstate_reset_and_return(o, state);
99         }
100       }
101     }
102     return mbstate_reset_and_return(o, state);
103   }
104 
105   // Actually convert, updating `dst` and `src`.
106   for (i = o = 0; i < nmc && o < len; i += r, o++) {
107     if (static_cast<uint8_t>((*src)[i]) < 0x80) {
108       // Fast path for plain ASCII characters.
109       dst[o] = (*src)[i];
110       r = 1;
111       if ((*src)[i] == '\0') {
112         *src = nullptr;
113         return mbstate_reset_and_return(o, state);
114       }
115     } else {
116       r = mbrtowc(dst + o, *src + i, nmc - i, state);
117       if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
118         *src += i;
119         return mbstate_reset_and_return_illegal(EILSEQ, state);
120       }
121       if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
122         *src += nmc;
123         return mbstate_reset_and_return_illegal(EILSEQ, state);
124       }
125       if (r == 0) {
126         *src = nullptr;
127         return mbstate_reset_and_return(o, state);
128       }
129     }
130   }
131   *src += i;
132   return mbstate_reset_and_return(o, state);
133 }
134 
// Converts the NUL-terminated multibyte string `*src` into at most `len` wide
// characters in `dst`.
//
// This is mbsnrtowcs() with no limit on the number of input bytes; arguments
// and return value are passed through unchanged.
size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
  const size_t no_byte_limit = SIZE_MAX;
  return mbsnrtowcs(dst, src, no_byte_limit, len, ps);
}
138 
// Converts the wide character `wc` into its multibyte representation in `s`,
// by forwarding to c32rtomb().
//
// When `ps` is null, a conversion state private to this function is used.
// The return value is whatever c32rtomb() returns.
size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) {
  static mbstate_t fallback_state;
  if (ps == nullptr) {
    ps = &fallback_state;
  }

  // Our wchar_t is UTF-32, so the value converts directly to char32_t.
  const char32_t c32 = static_cast<char32_t>(wc);
  return c32rtomb(s, c32, ps);
}
146 
wcsnrtombs(char * dst,const wchar_t ** src,size_t nwc,size_t len,mbstate_t * ps)147 size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
148   static mbstate_t __private_state;
149   mbstate_t* state = (ps == nullptr) ? &__private_state : ps;
150 
151   if (!mbsinit(state)) {
152     return mbstate_reset_and_return_illegal(EILSEQ, state);
153   }
154 
155   char buf[MB_LEN_MAX];
156   size_t i, o, r;
157   if (dst == nullptr) {
158     for (i = o = 0; i < nwc; i++, o += r) {
159       wchar_t wc = (*src)[i];
160       if (static_cast<uint32_t>(wc) < 0x80) {
161         // Fast path for plain ASCII characters.
162         if (wc == 0) {
163           return o;
164         }
165         r = 1;
166       } else {
167         r = wcrtomb(buf, wc, state);
168         if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
169           return r;
170         }
171       }
172     }
173     return o;
174   }
175 
176   for (i = o = 0; i < nwc && o < len; i++, o += r) {
177     wchar_t wc = (*src)[i];
178     if (static_cast<uint32_t>(wc) < 0x80) {
179       // Fast path for plain ASCII characters.
180       dst[o] = wc;
181       if (wc == 0) {
182         *src = nullptr;
183         return o;
184       }
185       r = 1;
186     } else if (len - o >= sizeof(buf)) {
187       // Enough space to translate in-place.
188       r = wcrtomb(dst + o, wc, state);
189       if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
190         *src += i;
191         return r;
192       }
193     } else {
194       // May not be enough space; use temp buffer.
195       r = wcrtomb(buf, wc, state);
196       if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
197         *src += i;
198         return r;
199       }
200       if (r > len - o) {
201         break;
202       }
203       memcpy(dst + o, buf, r);
204     }
205   }
206   *src += i;
207   return o;
208 }
209 
// Converts the NUL-terminated wide string `*src` into at most `len` bytes of
// multibyte output in `dst`.
//
// This is wcsnrtombs() with no limit on the number of input wide characters;
// arguments and return value are passed through unchanged.
size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
  const size_t no_char_limit = SIZE_MAX;
  return wcsnrtombs(dst, src, no_char_limit, len, ps);
}
213