1 /* Copyright 2008,2009 Alain Knaff.
2 * This file is part of mtools.
3 *
4 * Mtools is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * Mtools is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with Mtools. If not, see <http://www.gnu.org/licenses/>.
16 *
17 * Various character set conversions used by mtools
18 */
19 #include "sysincludes.h"
20 #include "msdos.h"
21 #include "mtools.h"
22
23 #include <stdio.h>
24 #include <errno.h>
25 #include <stdlib.h>
26 #include "file_name.h"
27
28
29 #ifdef HAVE_ICONV_H
30 #include <iconv.h>
31
/*
 * iconv-backed state of one open DOS codepage.
 * Both descriptors are created by cp_open() and released by cp_close().
 */
struct doscp_t {
	iconv_t from;	/* DOS codepage -> wchar_t; used by dos_to_wchar() */
	iconv_t to;	/* wchar_t -> DOS codepage; used by wchar_to_dos() */
};
36
/* iconv name of the charset matching wchar_t; filled in lazily by
 * getWcharCp() and NULL until a working name has been found */
static const char *wcharCp = NULL;

/* Candidate iconv names for the wchar_t encoding, probed in this order */
static const char *wcharTries[] = {
	"WCHAR_T",
	"UTF-32BE",
	"UTF-32LE",
	"UTF-16BE",
	"UTF-16LE",
	"UTF-32",
	"UTF-16",
	"UCS-4BE",
	"UCS-4LE",
	"UCS-2BE",
	"UCS-2LE",
	"UCS-4",
	"UCS-2"
};

/* Candidate iconv names for a plain single-byte target used when probing */
static const char *asciiTries[] = {
	"ASCII",
	"ASCII-GR",
	"ISO8859-1"
};

/* Probe input: must convert to the two bytes "ab" */
static const wchar_t *testString = L"ab";
54
try(const char * testCp)55 static int try(const char *testCp) {
56 size_t res;
57 char *inbuf = (char *)testString;
58 size_t inbufLen = 2*sizeof(wchar_t);
59 char outbuf[3];
60 char *outbufP = outbuf;
61 size_t outbufLen = 2*sizeof(char);
62 iconv_t test = 0;
63 size_t i;
64
65 for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) {
66 test = iconv_open(asciiTries[i], testCp);
67 if(test != (iconv_t) -1)
68 break;
69 }
70 if(test == (iconv_t) -1)
71 goto fail0;
72 res = iconv(test,
73 &inbuf, &inbufLen,
74 &outbufP, &outbufLen);
75 if(res != 0 || outbufLen != 0 || inbufLen != 0)
76 goto fail;
77 if(memcmp(outbuf, "ab", 2))
78 goto fail;
79 /* fprintf(stderr, "%s ok\n", testCp); */
80 return 1;
81 fail:
82 iconv_close(test);
83 fail0:
84 /*fprintf(stderr, "%s fail\n", testCp);*/
85 return 0;
86 }
87
getWcharCp(void)88 static const char *getWcharCp(void) {
89 unsigned int i;
90 if(wcharCp != NULL)
91 return wcharCp;
92 for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
93 if(try(wcharTries[i]))
94 return (wcharCp=wcharTries[i]);
95 }
96 fprintf(stderr, "No codepage found for wchar_t\n");
97 return NULL;
98 }
99
100
cp_open(int codepage)101 doscp_t *cp_open(int codepage)
102 {
103 char dosCp[17];
104 doscp_t *ret;
105 iconv_t from;
106 iconv_t to;
107
108 if(codepage == 0)
109 codepage = mtools_default_codepage;
110 if(codepage < 0 || codepage > 9999) {
111 fprintf(stderr, "Bad codepage %d\n", codepage);
112 return NULL;
113 }
114
115 if(getWcharCp() == NULL)
116 return NULL;
117
118 sprintf(dosCp, "CP%d", codepage);
119 from = iconv_open(wcharCp, dosCp);
120 if(from == (iconv_t)-1) {
121 fprintf(stderr, "Error converting to codepage %d %s\n",
122 codepage, strerror(errno));
123 return NULL;
124 }
125
126 sprintf(dosCp, "CP%d//TRANSLIT", codepage);
127 to = iconv_open(dosCp, wcharCp);
128 if(to == (iconv_t)-1) {
129 /* Transliteration not supported? */
130 sprintf(dosCp, "CP%d", codepage);
131 to = iconv_open(dosCp, wcharCp);
132 }
133 if(to == (iconv_t)-1) {
134 iconv_close(from);
135 fprintf(stderr, "Error converting to codepage %d %s\n",
136 codepage, strerror(errno));
137 return NULL;
138 }
139
140 ret = New(doscp_t);
141 if(ret == NULL)
142 return ret;
143 ret->from = from;
144 ret->to = to;
145 return ret;
146 }
147
/**
 * Releases a codepage handle obtained from cp_open(): closes both iconv
 * descriptors, then frees the structure. cp must not be NULL.
 */
void cp_close(doscp_t *cp)
{
	iconv_t toDos = cp->to;
	iconv_t fromDos = cp->from;

	iconv_close(toDos);
	iconv_close(fromDos);
	free(cp);
}
154
/**
 * Converts len bytes of DOS-codepage text to wide characters.
 *
 * cp:    codepage handle from cp_open()
 * dos:   input bytes (exactly len bytes are handed to iconv)
 * wchar: output buffer; needs room for len wide characters plus the
 *        terminating L'\0' written after the converted text
 * len:   number of input bytes
 *
 * Returns the number of wide characters produced, or -1 if iconv reports
 * an error (in which case no terminator is written).
 */
int dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len)
{
	size_t in_len = len;
	size_t out_len = len * sizeof(wchar_t);
	/* iconv's prototype takes non-const char ** for both buffers; use
	 * real char * cursors rather than casting the address of a
	 * wchar_t * variable */
	char *src = (char *) dos;
	char *dst = (char *) wchar;
	size_t produced;

	/* iconv returns size_t; failure is (size_t)-1, not "negative" */
	if(iconv(cp->from, &src, &in_len, &dst, &out_len) == (size_t) -1)
		return -1;

	produced = (size_t) (dst - (char *) wchar) / sizeof(wchar_t);
	wchar[produced] = L'\0';
	return (int) produced;
}
169
/**
 * Converts in_len wide characters to the charset of conv, writing at most
 * out_len bytes to dest. It is the caller's responsibility to ensure that
 * dest is large enough.
 *
 * A character that iconv rejects with EILSEQ is replaced by '_' and
 * skipped. Any '?' in the output (as may be produced for untransliterable
 * characters) is likewise turned into '_'. In both cases bit 0 of *mangled
 * is set; *mangled is never cleared here, so the caller must initialise it.
 *
 * Conversion stops early on any iconv error other than EILSEQ, or when the
 * output space is used up.
 *
 * Returns the number of bytes written to dest (no terminator is added).
 */
static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
		      size_t in_len, size_t out_len, int *mangled)
{
	/* iconv wants char ** cursors; keep a real char * instead of casting
	 * the address of the wchar_t pointer */
	char *src = (char *) wchar;
	char *dptr = dest;
	size_t in_bytes = in_len * sizeof(wchar_t);
	size_t len;
	size_t i;

	while(in_bytes > 0 && out_len > 0) {
		size_t r = iconv(conv, &src, &in_bytes, &dptr, &out_len);
		if(r != (size_t) -1 || errno != EILSEQ) {
			/* everything transformed, or error that is _not_ a bad
			 * character */
			break;
		}
		*mangled |= 1;

		/* iconv may have filled the output before hitting the bad
		 * character; then there is no room for the substitute */
		if(out_len == 0)
			break;
		*dptr++ = '_';
		out_len--;

		/* step over the offending wide character */
		src += sizeof(wchar_t);
		in_bytes -= sizeof(wchar_t);
	}

	len = (size_t) (dptr - dest);	/* how many dest characters have
					 * been generated */

	/* eliminate question marks which might have been formed by
	   untransliterable characters */
	for(i = 0; i < len; i++) {
		if(dest[i] == '?') {
			dest[i] = '_';
			*mangled |= 1;
		}
	}
	return (int) len;
}
217
/**
 * Converts len wide characters to the DOS codepage of cp, writing at most
 * len bytes to dos. Bit 0 of *mangled is set by safe_iconv() when a
 * character had to be substituted. The number of bytes actually produced
 * is not reported to the caller.
 */
void wchar_to_dos(doscp_t *cp,
		  wchar_t *wchar, char *dos, size_t len, int *mangled)
{
	iconv_t toDos = cp->to;

	(void) safe_iconv(toDos, wchar, dos, len, len, mangled);
}
223
224 #else
225
226 #include "codepage.h"
227
/*
 * Table-driven codepage state used when iconv is not available.
 * Only the upper half (bytes with bit 7 set) is translated through tables.
 */
struct doscp_t {
	/* indexed by (DOS byte & 0x7f); set by cp_open() to the tounix table
	 * of the matching codepages[] entry (not owned by this struct) */
	unsigned char *from_dos;
	/* reverse table, indexed by (native byte & 0x7f), filled by cp_open();
	 * wchar_to_dos() treats a 0 entry as "no DOS equivalent" */
	unsigned char to_dos[0x80];
};
232
cp_open(int codepage)233 doscp_t *cp_open(int codepage)
234 {
235 doscp_t *ret;
236 int i;
237 Codepage_t *cp;
238
239 if(codepage == 0)
240 codepage = 850;
241
242 ret = New(doscp_t);
243 if(ret == NULL)
244 return ret;
245
246 for(cp=codepages; cp->nr ; cp++)
247 if(cp->nr == codepage) {
248 ret->from_dos = cp->tounix;
249 break;
250 }
251
252 if(ret->from_dos == NULL) {
253 fprintf(stderr, "Bad codepage %d\n", codepage);
254 free(ret);
255 return NULL;
256 }
257
258 for(i=0; i<0x80; i++) {
259 char native = ret->from_dos[i];
260 if(! (native & 0x80))
261 continue;
262 ret->to_dos[native & 0x7f] = 0x80 | i;
263 }
264 return ret;
265 }
266
/**
 * Releases a handle obtained from cp_open(). Only the structure itself is
 * freed; from_dos is not freed here.
 */
void cp_close(doscp_t *cp)
{
	free(cp);
}
271
/**
 * Converts up to len bytes of DOS text to wide characters, stopping early
 * at a NUL byte. Printable ASCII (' ' .. '~') is copied unchanged; every
 * other byte is looked up in cp->from_dos using its low seven bits.
 * NOTE(review): this also sends control characters and DEL through the
 * table - confirm that is intended.
 *
 * wchar needs room for one more element than the number of characters
 * converted, because a terminating 0 is always appended.
 *
 * Returns the number of characters converted.
 */
int dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len)
{
	int n = 0;

	while(n < len && dos[n]) {
		char ch = dos[n];

		if(ch < ' ' || ch > '~')
			wchar[n] = cp->from_dos[ch & 0x7f];
		else
			wchar[n] = ch;
		n++;
	}
	wchar[n] = '\0';
	return n;
}
287
288
/**
 * Converts up to len wide characters to DOS-codepage bytes, stopping early
 * at a wide NUL (dos is not padded or terminated here).
 *
 * Printable ASCII (' ' .. '~') is copied unchanged. Other values in the
 * range 0..0xff are looked up in cp->to_dos using their low seven bits.
 * Anything without a table entry, and any code point outside 0..0xff, is
 * replaced by '_' and bit 0 of *mangled is set.
 *
 * Code points above 0xff are rejected explicitly instead of being
 * truncated to a char, which would silently alias them onto unrelated
 * characters (e.g. U+0141 onto 'A').
 */
void wchar_to_dos(doscp_t *cp,
		  wchar_t *wchar, char *dos, size_t len, int *mangled)
{
	size_t i;

	for(i=0; i<len && wchar[i]; i++) {
		long code = (long) wchar[i];

		/* tolerate single bytes that reached us sign-extended
		 * (wchar_t and char both signed): map -128..-1 back to
		 * 0x80..0xff */
		if(code < 0 && code >= -0x80)
			code += 0x100;

		if(code >= ' ' && code <= '~') {
			dos[i] = (char) code;
			continue;
		}

		if(code >= 0 && code <= 0xff)
			dos[i] = (char) cp->to_dos[code & 0x7f];
		else
			dos[i] = '\0';	/* not representable at all */

		if(dos[i] == '\0') {
			dos[i] = '_';
			/* OR the flag in so other bits the caller may have
			 * set in *mangled are kept */
			*mangled |= 1;
		}
	}
}
306
307 #endif
308
309
310 #ifndef HAVE_WCHAR_H
311
/* Minimal single-byte stand-ins for builds without <wchar.h>. */

/* Dummy conversion state; the stand-ins below keep no state. */
typedef int mbstate_t;

/**
 * Stores the low byte of wc in *s. Always reports one byte written.
 */
static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
{
	(void) ps;
	*s = (char) wc;
	return 1;
}

/**
 * Treats *s as one single-byte character and stores it in *pwc.
 *
 * The byte is read as unsigned so that values 0x80..0xff do not become
 * negative wide characters where char is signed. As with the standard
 * function, 0 is returned when the character is NUL, so callers that stop
 * on a zero return detect the end of the string; otherwise 1 is returned.
 * n and ps are ignored.
 */
static inline size_t mbrtowc(wchar_t *pwc, const char *s,
			     size_t n, mbstate_t *ps)
{
	(void) n;
	(void) ps;
	*pwc = (unsigned char) *s;
	return *s ? 1 : 0;
}
326
327 #endif
328
329 #ifdef HAVE_ICONV_H
330
331 #include <langinfo.h>
332
333 static iconv_t to_native = NULL;
334
initialize_to_native(void)335 static void initialize_to_native(void)
336 {
337 char *li, *cp;
338 int len;
339 if(to_native != NULL)
340 return;
341 li = nl_langinfo(CODESET);
342 len = strlen(li) + 11;
343 if(getWcharCp() == NULL)
344 exit(1);
345 cp = safe_malloc(len);
346 strcpy(cp, li);
347 strcat(cp, "//TRANSLIT");
348 to_native = iconv_open(cp, wcharCp);
349 if(to_native == (iconv_t) -1)
350 to_native = iconv_open(li, wcharCp);
351 if(to_native == (iconv_t) -1)
352 fprintf(stderr, "Could not allocate iconv for %s\n", cp);
353 free(cp);
354 if(to_native == (iconv_t) -1)
355 exit(1);
356 }
357
358
359
360 #endif
361
362
/**
 * Convert wchar string to native, converting at most len wchar characters
 * and stopping early at a wide NUL. The result is NUL-terminated.
 *
 * With iconv, at most out_len bytes are produced and the terminator is
 * written after them, so native needs room for out_len + 1 bytes.
 * NOTE(review): the non-iconv path does not consult out_len; the caller
 * must size native for the worst case there.
 *
 * Characters that cannot be represented are replaced by '_'.
 *
 * Returns number of generated native characters (bytes), or -1 if the
 * non-iconv path hits a conversion error other than EILSEQ.
 */
int wchar_to_native(const wchar_t *wchar, char *native, size_t len,
		    size_t out_len)
{
#ifdef HAVE_ICONV_H
	/* safe_iconv() only ORs bits into this, so it must start defined */
	int mangled = 0;
	int r;
	initialize_to_native();
	len = wcsnlen(wchar,len);
	r=safe_iconv(to_native, wchar, native, len, out_len, &mangled);
	native[r]='\0';
	return r;
#else
	size_t i;
	char *dptr = native;
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));
	for(i=0; i<len && wchar[i] != 0; i++) {
		/* wcrtomb returns size_t; failure is (size_t)-1 */
		size_t r = wcrtomb(dptr, wchar[i], &ps);
		if(r == (size_t) -1) {
			if(errno != EILSEQ)
				return -1;
			/* unrepresentable character: substitute and start
			 * over with a clean conversion state */
			*dptr='_';
			r=1;
			memset(&ps, 0, sizeof(ps));
		}
		dptr+=r;
	}
	*dptr='\0';
	return (int) (dptr-native);
#endif
}
397
/**
 * Convert native string to wchar string, generating at most len wchar
 * characters. If end is supplied, conversion stops once the source pointer
 * reaches end and no byte at or beyond end is examined; otherwise it stops
 * at the terminating NUL.
 *
 * A byte sequence that cannot be decoded is consumed one byte at a time:
 * bytes 0xa0..0xfe are taken as Latin1, anything else becomes '_'.
 *
 * If mangled is non-NULL and input was left unconverted because len was
 * reached, *mangled is ORed with 3.
 *
 * wchar needs room for len + 1 elements since a terminating 0 is always
 * appended. Returns number of generated wchars.
 */
int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
		    const char *end, int *mangled)
{
	mbstate_t ps;
	size_t i;
	memset(&ps, 0, sizeof(ps));

	/* test !end first so a NULL end is never used in a comparison */
	for(i=0; i<len && (!end || native < end); i++) {
		/* Byte budget for this one character: what is left before
		   end, or the longest possible character when the input is
		   NUL-terminated. (len counts wide characters and is not a
		   byte limit.) */
		size_t avail = end ? (size_t) (end - native)
				   : (size_t) MB_CUR_MAX;
		size_t r = mbrtowc(wchar+i, native, avail, &ps);
		if(r == (size_t) -1 || r == (size_t) -2) {
			/* Unconvertible character. Just pretend it's Latin1
			   encoded (if valid Latin1 character) or substitute
			   with an underscore if not
			*/
			char c = *native;
			if(c >= '\xa0' && c < '\xff')
				wchar[i] = c & 0xff;
			else
				wchar[i] = '_';
			memset(&ps, 0, sizeof(ps));
			r=1;
		}
		if(r == 0)
			break;	/* reached the terminating NUL */
		native += r;
	}
	if(mangled && ((end && native < end) || (!end && *native && i == len)))
		*mangled |= 3;
	wchar[i]='\0';
	return (int) i;
}
434
435