1 /*  Copyright 2008,2009 Alain Knaff.
2  *  This file is part of mtools.
3  *
4  *  Mtools is free software: you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation, either version 3 of the License, or
7  *  (at your option) any later version.
8  *
9  *  Mtools is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with Mtools.  If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Various character set conversions used by mtools
18  */
#include "sysincludes.h"
#include "msdos.h"
#include "mtools.h"

#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include "file_name.h"
27 
28 
29 #ifdef HAVE_ICONV_H
30 #include <iconv.h>
31 
/* Pair of iconv descriptors that cp_open() sets up for one DOS codepage. */
struct doscp_t {
	iconv_t from;	/* opened with the codepage as source: DOS -> wchar_t */
	iconv_t to;	/* opened with the codepage as target: wchar_t -> DOS */
};
36 
/* iconv charset name that matches this platform's wchar_t. Stays NULL until
 * getWcharCp() has found a working candidate. */
static const char *wcharCp = NULL;

/* Candidate charset names for wchar_t, probed front to back by
 * getWcharCp(); the first one that passes try() wins. */
static const char *wcharTries[] = {
	"WCHAR_T",
	"UTF-32BE",
	"UTF-32LE",
	"UTF-16BE",
	"UTF-16LE",
	"UTF-32",
	"UTF-16",
	"UCS-4BE",
	"UCS-4LE",
	"UCS-2BE",
	"UCS-2LE",
	"UCS-4",
	"UCS-2"
};
48 
/* Target charsets used by try(); the first one iconv_open() accepts is
 * used for the probe conversion. */
static const char *asciiTries[] = {
	"ASCII", "ASCII-GR", "ISO8859-1"
};

/* Probe text: must come out as the two bytes "ab" if the candidate charset
 * really describes wchar_t. */
static const wchar_t *testString = L"ab";

/**
 * Tests whether the iconv charset name testCp describes this platform's
 * wchar_t representation.
 *
 * The wide string L"ab" is converted from testCp into the first charset of
 * asciiTries that iconv_open() accepts. Returns 1 if that yields exactly
 * the bytes "ab" with all input consumed, 0 otherwise (including when no
 * conversion descriptor could be opened).
 */
static int try(const char *testCp) {
	char *inbuf = (char *)testString; /* iconv() wants a non-const char ** */
	size_t inbufLen = 2*sizeof(wchar_t);
	char outbuf[3];
	char *outbufP = outbuf;
	size_t outbufLen = 2*sizeof(char);
	iconv_t test = (iconv_t) -1;
	size_t res;
	size_t i;
	int ok;

	for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) {
		test = iconv_open(asciiTries[i], testCp);
		if(test != (iconv_t) -1)
			break;
	}
	if(test == (iconv_t) -1)
		return 0;

	res = iconv(test,
		    &inbuf, &inbufLen,
		    &outbufP, &outbufLen);
	ok = res == 0 && outbufLen == 0 && inbufLen == 0 &&
		memcmp(outbuf, "ab", 2) == 0;

	/* The descriptor is only needed for this probe: release it on the
	 * success path as well, otherwise every successful probe leaks one */
	iconv_close(test);
	return ok;
}
87 
getWcharCp(void)88 static const char *getWcharCp(void) {
89 	unsigned int i;
90 	if(wcharCp != NULL)
91 		return wcharCp;
92 	for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
93 		if(try(wcharTries[i]))
94 			return (wcharCp=wcharTries[i]);
95 	}
96 	fprintf(stderr, "No codepage found for wchar_t\n");
97 	return NULL;
98 }
99 
100 
cp_open(int codepage)101 doscp_t *cp_open(int codepage)
102 {
103 	char dosCp[17];
104 	doscp_t *ret;
105 	iconv_t from;
106 	iconv_t to;
107 
108 	if(codepage == 0)
109 		codepage = mtools_default_codepage;
110 	if(codepage < 0 || codepage > 9999) {
111 		fprintf(stderr, "Bad codepage %d\n", codepage);
112 		return NULL;
113 	}
114 
115 	if(getWcharCp() == NULL)
116 		return NULL;
117 
118 	sprintf(dosCp, "CP%d", codepage);
119 	from = iconv_open(wcharCp, dosCp);
120 	if(from == (iconv_t)-1) {
121 		fprintf(stderr, "Error converting to codepage %d %s\n",
122 			codepage, strerror(errno));
123 		return NULL;
124 	}
125 
126 	sprintf(dosCp, "CP%d//TRANSLIT", codepage);
127 	to   =  iconv_open(dosCp, wcharCp);
128 	if(to == (iconv_t)-1) {
129 		/* Transliteration not supported? */
130 		sprintf(dosCp, "CP%d", codepage);
131 		to   =  iconv_open(dosCp, wcharCp);
132 	}
133 	if(to == (iconv_t)-1) {
134 		iconv_close(from);
135 		fprintf(stderr, "Error converting to codepage %d %s\n",
136 			codepage, strerror(errno));
137 		return NULL;
138 	}
139 
140 	ret = New(doscp_t);
141 	if(ret == NULL)
142 		return ret;
143 	ret->from = from;
144 	ret->to   = to;
145 	return ret;
146 }
147 
/**
 * Closes both iconv descriptors held by cp and frees cp itself.
 * Passing NULL (which is what cp_open() returns on failure) is a no-op,
 * mirroring free().
 */
void cp_close(doscp_t *cp)
{
	if(cp == NULL)
		return;
	iconv_close(cp->to);
	iconv_close(cp->from);
	free(cp);
}
154 
/**
 * Converts len bytes of DOS-codepage text at dos into wide characters.
 *
 * wchar must have room for len wide characters plus a terminator. The
 * output is always terminated, even when the conversion stops early, so a
 * caller that ignores the result never sees an unterminated buffer.
 *
 * Returns the number of wide characters generated (not counting the
 * terminator), or -1 if iconv() reported an error.
 */
int dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len)
{
	char *in = (char *) dos; /* Magic to be able to call iconv with its
				    buggy prototype */
	char *out = (char *) wchar;
	size_t in_len=len;
	size_t out_len=len*sizeof(wchar_t);
	size_t count;
	size_t r;

	/* iconv() returns a size_t; failure is (size_t) -1, which must be
	 * tested as such rather than after narrowing to int */
	r=iconv(cp->from, &in, &in_len, &out, &out_len);

	/* out has advanced past whatever was converted successfully */
	count = (size_t)(out - (char *) wchar) / sizeof(wchar_t);
	wchar[count] = L'\0';

	if(r == (size_t) -1)
		return -1;
	return (int) count;
}
169 
/**
 * Runs in_len wide characters from wchar through conv, storing at most
 * out_len bytes at dest. The result is not terminated, and it is up to the
 * caller to make sure that dest really has out_len bytes.
 *
 * A wide character that iconv rejects with EILSEQ is replaced by '_'; any
 * '?' found in the output afterwards is turned into '_' as well. Both cases
 * set bit 0 of *mangled. *mangled is only ever ORed into, so the caller
 * has to initialise it.
 *
 * Returns the number of bytes stored at dest.
 */
static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
		      size_t in_len, size_t out_len, int *mangled)
{
	char *out = dest;
	size_t in_bytes = in_len * sizeof(wchar_t);
	size_t produced;
	size_t pos;

	while(in_bytes > 0 && out_len > 0) {
		int rc = iconv(conv, (char**)&wchar, &in_bytes,
			       &out, &out_len);

		/* done, or stopped by something other than a bad character */
		if(rc >= 0 || errno != EILSEQ)
			break;

		*mangled |= 1;
		if(out_len == 0)
			break;

		/* stand-in for the offending character, which is skipped */
		if(out != NULL)
			*out++ = '_';
		out_len--;
		wchar++;
		in_bytes -= sizeof(wchar_t);
	}

	produced = out - dest;

	/* question marks may be what transliteration left behind for
	   characters it could not express */
	for(pos = 0; pos < produced; pos++) {
		if(dest[pos] != '?')
			continue;
		dest[pos] = '_';
		*mangled |= 1;
	}
	return produced;
}
217 
/**
 * Converts len wide characters from wchar to the DOS codepage of cp,
 * storing at most len bytes at dos. The output is not terminated here.
 * Bit 0 of *mangled is set if a character had to be replaced.
 */
void wchar_to_dos(doscp_t *cp,
		  wchar_t *wchar, char *dos, size_t len, int *mangled)
{
	/* the output budget equals the number of input characters; the
	 * byte count returned by safe_iconv() is not needed */
	(void) safe_iconv(cp->to, wchar, dos, len, len, mangled);
}
223 
224 #else
225 
226 #include "codepage.h"
227 
/* Table-driven codepage state used when iconv is not available. */
struct doscp_t {
	/* set by cp_open() to the tounix table of the matching codepages
	 * entry; indexed with the low seven bits of a DOS byte */
	unsigned char *from_dos;
	/* filled in by cp_open() as the reverse of from_dos; an entry of
	 * zero is treated by wchar_to_dos() as "no DOS equivalent" */
	unsigned char to_dos[128];
};
232 
cp_open(int codepage)233 doscp_t *cp_open(int codepage)
234 {
235 	doscp_t *ret;
236 	int i;
237 	Codepage_t *cp;
238 
239 	if(codepage == 0)
240 		codepage = 850;
241 
242 	ret = New(doscp_t);
243 	if(ret == NULL)
244 		return ret;
245 
246 	for(cp=codepages; cp->nr ; cp++)
247 		if(cp->nr == codepage) {
248 			ret->from_dos = cp->tounix;
249 			break;
250 		}
251 
252 	if(ret->from_dos == NULL) {
253 		fprintf(stderr, "Bad codepage %d\n", codepage);
254 		free(ret);
255 		return NULL;
256 	}
257 
258 	for(i=0; i<0x80; i++) {
259 		char native = ret->from_dos[i];
260 		if(! (native & 0x80))
261 			continue;
262 		ret->to_dos[native & 0x7f] = 0x80 | i;
263 	}
264 	return ret;
265 }
266 
/**
 * Releases a doscp_t obtained from cp_open().
 */
void cp_close(doscp_t *cp)
{
	/* to_dos lives inside *cp and from_dos only points into the
	 * codepages list, so there is nothing else to release */
	free(cp);
}
271 
/**
 * Converts DOS-codepage text at dos into wide characters, stopping after
 * len bytes or at the first NUL byte, whichever comes first.
 *
 * Bytes below 0x80 are copied unchanged. Bytes with the high bit set are
 * translated through cp->from_dos. wchar must have room for len wide
 * characters plus the terminator that is always appended.
 *
 * Returns the number of wide characters generated, not counting the
 * terminator.
 */
int dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len)
{
	size_t i;

	for(i=0; i<len && dos[i]; i++) {
		/* unsigned, so that high bytes compare as 0x80..0xff no
		 * matter how plain char is signed */
		unsigned char c = (unsigned char) dos[i];

		/* from_dos only describes the upper half of the codepage.
		 * Control characters and 0x7f must not be looked up there;
		 * they stay as they are. */
		if(c < 0x80)
			wchar[i] = c;
		else
			wchar[i] = cp->from_dos[c & 0x7f];
	}
	wchar[i] = '\0';
	return (int) i;
}
287 
288 
/**
 * Converts wide characters from wchar to the DOS codepage of cp, stopping
 * after len characters or at the first L'\0'. One byte is stored at dos
 * per input character; the output is not terminated here.
 *
 * Characters below 0x80 are copied unchanged. Characters 0x80..0xff are
 * translated through cp->to_dos. Anything else, and any character without
 * an entry in to_dos, becomes '_' and sets bit 0 of *mangled (other bits
 * are left alone).
 */
void wchar_to_dos(doscp_t *cp,
		  wchar_t *wchar, char *dos, size_t len, int *mangled)
{
	size_t i;

	for(i=0; i<len && wchar[i]; i++) {
		long v = (long) wchar[i];
		unsigned char mapped = 0; /* 0 means: no DOS equivalent */

		/* A value in -128..-1 is a high byte that was widened
		 * through a signed char (a plain "*pwc = *s" assignment
		 * does that). Fold it back to 0x80..0xff. */
		if(v < 0 && v >= -0x80)
			v &= 0xff;

		if(v >= 0 && v < 0x80)
			mapped = (unsigned char) v;
		else if(v >= 0x80 && v <= 0xff)
			mapped = cp->to_dos[v & 0x7f];
		/* else: beyond what a single-byte table can express; must
		 * not be truncated into some unrelated character */

		if(mapped == 0) {
			mapped = '_';
			*mangled |= 1;
		}
		dos[i] = (char) mapped;
	}
}
306 
307 #endif
308 
309 
310 #ifndef HAVE_WCHAR_H
311 
/* Minimal stand-ins for platforms without <wchar.h>: every character is
 * taken to be exactly one byte, and there is no conversion state. */
typedef int mbstate_t;

/**
 * Stores the low byte of wc at s. Always reports one byte written.
 * ps is ignored.
 */
static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
{
	(void) ps;
	*s = (char) wc;
	return 1;
}

/**
 * Stores the byte at s, widened as an unsigned value, into *pwc.
 * Returns 0 if that byte is the NUL terminator and 1 otherwise, like the
 * real mbrtowc(). Callers rely on the 0 to notice the end of a string.
 * n and ps are ignored.
 */
static inline size_t mbrtowc(wchar_t *pwc, const char *s,
			     size_t n, mbstate_t *ps)
{
	(void) n;
	(void) ps;
	/* go through unsigned char so that bytes 0x80..0xff do not turn
	 * into negative wide characters where plain char is signed */
	*pwc = (unsigned char) *s;
	return *s != '\0' ? 1 : 0;
}
326 
327 #endif
328 
329 #ifdef HAVE_ICONV_H
330 
331 #include <langinfo.h>
332 
333 static iconv_t to_native = NULL;
334 
initialize_to_native(void)335 static void initialize_to_native(void)
336 {
337 	char *li, *cp;
338 	int len;
339 	if(to_native != NULL)
340 		return;
341 	li = nl_langinfo(CODESET);
342 	len = strlen(li) + 11;
343 	if(getWcharCp() == NULL)
344 		exit(1);
345 	cp = safe_malloc(len);
346 	strcpy(cp, li);
347 	strcat(cp, "//TRANSLIT");
348 	to_native = iconv_open(cp, wcharCp);
349 	if(to_native == (iconv_t) -1)
350 		to_native = iconv_open(li, wcharCp);
351 	if(to_native == (iconv_t) -1)
352 		fprintf(stderr, "Could not allocate iconv for %s\n", cp);
353 	free(cp);
354 	if(to_native == (iconv_t) -1)
355 		exit(1);
356 }
357 
358 
359 
360 #endif
361 
362 
/**
 * Convert wchar string to native, converting at most len wchar characters
 * and stopping early at a L'\0'.
 *
 * out_len is taken to be the full size of the native buffer, terminator
 * included: at most out_len-1 bytes are generated and the result is always
 * terminated inside those out_len bytes. Output that does not fit is
 * dropped. With out_len == 0 nothing is written at all.
 * NOTE(review): callers are not visible here. One that passes the buffer
 * size minus one now gets one byte less than before.
 *
 * Characters that cannot be expressed in the native charset become '_'.
 *
 * Returns number of generated native characters (bytes, not counting the
 * terminator), or -1 if the conversion failed for a reason other than an
 * unconvertible character.
 */
int wchar_to_native(const wchar_t *wchar, char *native, size_t len,
		    size_t out_len)
{
#ifdef HAVE_ICONV_H
	/* safe_iconv() only ever ORs into this, so it must start out
	 * defined even though the value is not used afterwards */
	int mangled = 0;
	int r;

	if(out_len == 0)
		return 0;
	initialize_to_native();
	len = wcsnlen(wchar,len);
	/* hold one byte back for the terminator stored below */
	r=safe_iconv(to_native, wchar, native, len, out_len - 1, &mangled);
	native[r]='\0';
	return r;
#else
	size_t i;
	size_t used = 0;
	mbstate_t ps;

	if(out_len == 0)
		return 0;
	memset(&ps, 0, sizeof(ps));
	for(i=0; i<len && wchar[i] != 0; i++) {
		/* Convert into a scratch buffer first: wcrtomb() cannot be
		 * told how much room is left in native */
		char tmp[MB_LEN_MAX];
		size_t r = wcrtomb(tmp, wchar[i], &ps);

		if(r == (size_t) -1) {
			if(errno != EILSEQ) {
				native[used]='\0';
				return -1;
			}
			tmp[0]='_';
			r=1;
			/* the state is unspecified after a failure */
			memset(&ps, 0, sizeof(ps));
		}
		/* ">=" rather than ">" keeps one byte for the terminator */
		if(r >= out_len - used)
			break;
		memcpy(native + used, tmp, r);
		used += r;
	}
	native[used]='\0';
	return (int) used;
#endif
}
397 
/**
 * Convert native string to wchar string, generating at most len wchar
 * characters. wchar must have room for len characters plus the terminator
 * that is always appended.
 *
 * If end is supplied, conversion stops once the source pointer reaches
 * end, and no byte at or beyond end is examined. Without end, the source
 * is read up to its NUL terminator.
 *
 * A byte that does not start a valid character is taken as Latin1 if it
 * lies in 0xa0..0xfe and replaced by '_' otherwise.
 *
 * If mangled is supplied, bits 0 and 1 of *mangled are set when input was
 * left over (before end, or before the NUL terminator once len characters
 * have been generated).
 *
 * Returns number of generated wchars
 */
int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
		    const char *end, int *mangled)
{
	mbstate_t ps;
	size_t i;
	memset(&ps, 0, sizeof(ps));

	/* end is tested for NULL before being compared against */
	for(i=0; i<len && (!end || native < end); i++) {
		/* Number of source bytes mbrtowc() may look at. This is
		 * unrelated to len, which counts output characters. A
		 * NUL-terminated source needs no tighter bound than the
		 * longest character of the locale, because the terminator
		 * itself ends the scan. */
		size_t avail = end ? (size_t) (end - native)
				   : (size_t) MB_CUR_MAX;
		size_t r = mbrtowc(wchar+i, native, avail, &ps);

		if(r == (size_t) -1 || r == (size_t) -2) {
			/* Unconvertible character. Just pretend it's Latin1
			   encoded (if valid Latin1 character) or substitute
			   with an underscore if not
			*/
			char c = *native;
			if(c >= '\xa0' && c < '\xff')
				wchar[i] = c & 0xff;
			else
				wchar[i] = '_';
			memset(&ps, 0, sizeof(ps));
			r=1;
		}
		if(r == 0)
			break;
		native += r;
	}
	if(mangled && ((end && native < end) || (!end && *native &&  i == len)))
		*mangled |= 3;
	wchar[i]='\0';
	return (int) i;
}
434 
435