1 /*
2  * Transcoding support for CUPS.
3  *
4  * Copyright 2007-2014 by Apple Inc.
5  * Copyright 1997-2007 by Easy Software Products.
6  *
7  * Licensed under Apache License v2.0.  See the file "LICENSE" for more information.
8  */
9 
10 /*
11  * Include necessary headers...
12  */
13 
14 #include "cups-private.h"
15 #include "debug-internal.h"
16 #include <limits.h>
17 #include <time.h>
18 #ifdef HAVE_ICONV_H
19 #  include <iconv.h>
20 #endif /* HAVE_ICONV_H */
21 
22 
23 /*
24  * Local globals...
25  */
26 
27 #ifdef HAVE_ICONV_H
28 static _cups_mutex_t	map_mutex = _CUPS_MUTEX_INITIALIZER;
29 					/* Mutex held by the conversion functions while they use or replace the cached maps */
30 static iconv_t		map_from_utf8 = (iconv_t)-1;
31 					/* Cached iconv descriptor, UTF-8 to map_encoding; (iconv_t)-1 when none is open */
32 static iconv_t		map_to_utf8 = (iconv_t)-1;
33 					/* Cached iconv descriptor, map_encoding to UTF-8; (iconv_t)-1 when none is open */
34 static cups_encoding_t	map_encoding = CUPS_AUTO_ENCODING;
35 					/* Charset the cached maps were opened for; CUPS_AUTO_ENCODING after a flush */
36 #endif /* HAVE_ICONV_H */
37 
38 
/*
 * '_cupsCharmapFlush()' - Release the cached character set maps.
 *
 * Closes each cached iconv descriptor that is currently open, marks it as
 * closed with (iconv_t)-1, and resets the cached encoding to
 * CUPS_AUTO_ENCODING.  The function does not lock map_mutex itself; the
 * conversion functions in this file call it with the mutex already held.
 * Without HAVE_ICONV_H this is a no-op.
 */

void
_cupsCharmapFlush(void)
{
#ifdef HAVE_ICONV_H
  size_t	n;			/* Index into cached[] */
  iconv_t	*cached[2];		/* Cached descriptors, in close order */


  cached[0] = &map_from_utf8;
  cached[1] = &map_to_utf8;

  for (n = 0; n < sizeof(cached) / sizeof(cached[0]); n ++)
  {
    if (*cached[n] == (iconv_t)-1)
      continue;				/* Nothing open in this slot */

    iconv_close(*cached[n]);
    *cached[n] = (iconv_t)-1;
  }

  map_encoding = CUPS_AUTO_ENCODING;
#endif /* HAVE_ICONV_H */
}
62 
63 
64 /*
65  * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
66  */
67 
68 int					/* O - Count or -1 on error */
cupsCharsetToUTF8(cups_utf8_t * dest,const char * src,const int maxout,const cups_encoding_t encoding)69 cupsCharsetToUTF8(
70     cups_utf8_t           *dest,	/* O - Target string */
71     const char            *src,		/* I - Source string */
72     const int             maxout,	/* I - Max output */
73     const cups_encoding_t encoding)	/* I - Encoding */
74 {
75   cups_utf8_t	*destptr;		/* Pointer into UTF-8 buffer */
76 #ifdef HAVE_ICONV_H
77   size_t	srclen,			/* Length of source string */
78 		outBytesLeft;		/* Bytes remaining in output buffer */
79 #endif /* HAVE_ICONV_H */
80 
81 
82  /*
83   * Check for valid arguments...
84   */
85 
86   DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)", (void *)dest, src, maxout, encoding));
87 
88   if (!dest || !src || maxout < 1)
89   {
90     if (dest)
91       *dest = '\0';
92 
93     DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
94     return (-1);
95   }
96 
97  /*
98   * Handle identity conversions...
99   */
100 
101   if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
102       encoding >= CUPS_ENCODING_VBCS_END)
103   {
104     strlcpy((char *)dest, src, (size_t)maxout);
105     return ((int)strlen((char *)dest));
106   }
107 
108  /*
109   * Handle ISO-8859-1 to UTF-8 directly...
110   */
111 
112   destptr = dest;
113 
114   if (encoding == CUPS_ISO8859_1)
115   {
116     int		ch;			/* Character from string */
117     cups_utf8_t	*destend;		/* End of UTF-8 buffer */
118 
119 
120     destend = dest + maxout - 2;
121 
122     while (*src && destptr < destend)
123     {
124       ch = *src++ & 255;
125 
126       if (ch & 128)
127       {
128 	*destptr++ = (cups_utf8_t)(0xc0 | (ch >> 6));
129 	*destptr++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
130       }
131       else
132 	*destptr++ = (cups_utf8_t)ch;
133     }
134 
135     *destptr = '\0';
136 
137     return ((int)(destptr - dest));
138   }
139 
140  /*
141   * Convert input legacy charset to UTF-8...
142   */
143 
144 #ifdef HAVE_ICONV_H
145   _cupsMutexLock(&map_mutex);
146 
147   if (map_encoding != encoding)
148   {
149     char	toset[1024];		/* Destination character set */
150 
151     _cupsCharmapFlush();
152 
153     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
154 
155     map_encoding  = encoding;
156     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
157     map_to_utf8   = iconv_open("UTF-8", toset);
158   }
159 
160   if (map_to_utf8 != (iconv_t)-1)
161   {
162     char *altdestptr = (char *)dest;	/* Silence bogus GCC type-punned */
163 
164     srclen       = strlen(src);
165     outBytesLeft = (size_t)maxout - 1;
166 
167     iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft);
168     *altdestptr = '\0';
169 
170     _cupsMutexUnlock(&map_mutex);
171 
172     return ((int)(altdestptr - (char *)dest));
173   }
174 
175   _cupsMutexUnlock(&map_mutex);
176 #endif /* HAVE_ICONV_H */
177 
178  /*
179   * No iconv() support, so error out...
180   */
181 
182   *destptr = '\0';
183 
184   return (-1);
185 }
186 
187 
188 /*
189  * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
190  */
191 
192 int					/* O - Count or -1 on error */
cupsUTF8ToCharset(char * dest,const cups_utf8_t * src,const int maxout,const cups_encoding_t encoding)193 cupsUTF8ToCharset(
194     char		  *dest,	/* O - Target string */
195     const cups_utf8_t	  *src,		/* I - Source string */
196     const int		  maxout,	/* I - Max output */
197     const cups_encoding_t encoding)	/* I - Encoding */
198 {
199   char		*destptr;		/* Pointer into destination */
200 #ifdef HAVE_ICONV_H
201   size_t	srclen,			/* Length of source string */
202 		outBytesLeft;		/* Bytes remaining in output buffer */
203 #endif /* HAVE_ICONV_H */
204 
205 
206  /*
207   * Check for valid arguments...
208   */
209 
210   if (!dest || !src || maxout < 1)
211   {
212     if (dest)
213       *dest = '\0';
214 
215     return (-1);
216   }
217 
218  /*
219   * Handle identity conversions...
220   */
221 
222   if (encoding == CUPS_UTF8 ||
223       encoding >= CUPS_ENCODING_VBCS_END)
224   {
225     strlcpy(dest, (char *)src, (size_t)maxout);
226     return ((int)strlen(dest));
227   }
228 
229  /*
230   * Handle UTF-8 to ISO-8859-1 directly...
231   */
232 
233   destptr = dest;
234 
235   if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII)
236   {
237     int		ch,			/* Character from string */
238 		maxch;			/* Maximum character for charset */
239     char	*destend;		/* End of ISO-8859-1 buffer */
240 
241     maxch   = encoding == CUPS_ISO8859_1 ? 256 : 128;
242     destend = dest + maxout - 1;
243 
244     while (*src && destptr < destend)
245     {
246       ch = *src++;
247 
248       if ((ch & 0xe0) == 0xc0)
249       {
250 	ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
251 
252 	if (ch < maxch)
253           *destptr++ = (char)ch;
254 	else
255           *destptr++ = '?';
256       }
257       else if ((ch & 0xf0) == 0xe0 ||
258                (ch & 0xf8) == 0xf0)
259         *destptr++ = '?';
260       else if (!(ch & 0x80))
261 	*destptr++ = (char)ch;
262     }
263 
264     *destptr = '\0';
265 
266     return ((int)(destptr - dest));
267   }
268 
269 #ifdef HAVE_ICONV_H
270  /*
271   * Convert input UTF-8 to legacy charset...
272   */
273 
274   _cupsMutexLock(&map_mutex);
275 
276   if (map_encoding != encoding)
277   {
278     char	toset[1024];		/* Destination character set */
279 
280     _cupsCharmapFlush();
281 
282     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
283 
284     map_encoding  = encoding;
285     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
286     map_to_utf8   = iconv_open("UTF-8", toset);
287   }
288 
289   if (map_from_utf8 != (iconv_t)-1)
290   {
291     char *altsrc = (char *)src;		/* Silence bogus GCC type-punned */
292 
293     srclen       = strlen((char *)src);
294     outBytesLeft = (size_t)maxout - 1;
295 
296     iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft);
297     *destptr = '\0';
298 
299     _cupsMutexUnlock(&map_mutex);
300 
301     return ((int)(destptr - dest));
302   }
303 
304   _cupsMutexUnlock(&map_mutex);
305 #endif /* HAVE_ICONV_H */
306 
307  /*
308   * No iconv() support, so error out...
309   */
310 
311   *destptr = '\0';
312 
313   return (-1);
314 }
315 
316 
317 /*
318  * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
319  *
320  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
321  *
322  *   UTF-32 char     UTF-8 char(s)
323  *   --------------------------------------------------
324  *	  0 to 127 = 0xxxxxxx (US-ASCII)
325  *     128 to 2047 = 110xxxxx 10yyyyyy
326  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
327  *	   > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
328  *
329  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
330  * which would convert to five- or six-octet UTF-8 sequences...
331  */
332 
333 int					/* O - Count or -1 on error */
cupsUTF8ToUTF32(cups_utf32_t * dest,const cups_utf8_t * src,const int maxout)334 cupsUTF8ToUTF32(
335     cups_utf32_t      *dest,		/* O - Target string */
336     const cups_utf8_t *src,		/* I - Source string */
337     const int         maxout)		/* I - Max output */
338 {
339   int		i;			/* Looping variable */
340   cups_utf8_t	ch;			/* Character value */
341   cups_utf8_t	next;			/* Next character value */
342   cups_utf32_t	ch32;			/* UTF-32 character value */
343 
344 
345  /*
346   * Check for valid arguments and clear output...
347   */
348 
349   DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", (void *)dest, src, maxout));
350 
351   if (dest)
352     *dest = 0;
353 
354   if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
355   {
356     DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
357 
358     return (-1);
359   }
360 
361  /*
362   * Convert input UTF-8 to output UTF-32...
363   */
364 
365   for (i = maxout - 1; *src && i > 0; i --)
366   {
367     ch = *src++;
368 
369    /*
370     * Convert UTF-8 character(s) to UTF-32 character...
371     */
372 
373     if (!(ch & 0x80))
374     {
375      /*
376       * One-octet UTF-8 <= 127 (US-ASCII)...
377       */
378 
379       *dest++ = ch;
380 
381       DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
382       continue;
383     }
384     else if ((ch & 0xe0) == 0xc0)
385     {
386      /*
387       * Two-octet UTF-8 <= 2047 (Latin-x)...
388       */
389 
390       next = *src++;
391       if ((next & 0xc0) != 0x80)
392       {
393         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
394 
395 	return (-1);
396       }
397 
398       ch32 = (cups_utf32_t)((ch & 0x1f) << 6) | (cups_utf32_t)(next & 0x3f);
399 
400      /*
401       * Check for non-shortest form (invalid UTF-8)...
402       */
403 
404       if (ch32 < 0x80)
405       {
406         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
407 
408 	return (-1);
409       }
410 
411       *dest++ = ch32;
412 
413       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
414                     src[-2], src[-1], (unsigned)ch32));
415     }
416     else if ((ch & 0xf0) == 0xe0)
417     {
418      /*
419       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
420       */
421 
422       next = *src++;
423       if ((next & 0xc0) != 0x80)
424       {
425         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
426 
427 	return (-1);
428       }
429 
430       ch32 = (cups_utf32_t)((ch & 0x0f) << 6) | (cups_utf32_t)(next & 0x3f);
431 
432       next = *src++;
433       if ((next & 0xc0) != 0x80)
434       {
435         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
436 
437 	return (-1);
438       }
439 
440       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
441 
442      /*
443       * Check for non-shortest form (invalid UTF-8)...
444       */
445 
446       if (ch32 < 0x800)
447       {
448         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
449 
450 	return (-1);
451       }
452 
453       *dest++ = ch32;
454 
455       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
456                     src[-3], src[-2], src[-1], (unsigned)ch32));
457     }
458     else if ((ch & 0xf8) == 0xf0)
459     {
460      /*
461       * Four-octet UTF-8...
462       */
463 
464       next = *src++;
465       if ((next & 0xc0) != 0x80)
466       {
467         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
468 
469 	return (-1);
470       }
471 
472       ch32 = (cups_utf32_t)((ch & 0x07) << 6) | (cups_utf32_t)(next & 0x3f);
473 
474       next = *src++;
475       if ((next & 0xc0) != 0x80)
476       {
477         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
478 
479 	return (-1);
480       }
481 
482       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
483 
484       next = *src++;
485       if ((next & 0xc0) != 0x80)
486       {
487         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
488 
489 	return (-1);
490       }
491 
492       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
493 
494      /*
495       * Check for non-shortest form (invalid UTF-8)...
496       */
497 
498       if (ch32 < 0x10000)
499       {
500         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
501 
502 	return (-1);
503       }
504 
505       *dest++ = ch32;
506 
507       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
508                     src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
509     }
510     else
511     {
512      /*
513       * More than 4-octet (invalid UTF-8 sequence)...
514       */
515 
516       DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
517 
518       return (-1);
519     }
520 
521    /*
522     * Check for UTF-16 surrogate (illegal UTF-8)...
523     */
524 
525     if (ch32 >= 0xd800 && ch32 <= 0xdfff)
526       return (-1);
527   }
528 
529   *dest = 0;
530 
531   DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
532 
533   return (maxout - 1 - i);
534 }
535 
536 
537 /*
538  * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
539  *
540  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
541  *
542  *   UTF-32 char     UTF-8 char(s)
543  *   --------------------------------------------------
544  *	  0 to 127 = 0xxxxxxx (US-ASCII)
545  *     128 to 2047 = 110xxxxx 10yyyyyy
546  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
547  *	   > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
548  *
549  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
550  * which would convert to five- or six-octet UTF-8 sequences...
551  */
552 
553 int					/* O - Count or -1 on error */
cupsUTF32ToUTF8(cups_utf8_t * dest,const cups_utf32_t * src,const int maxout)554 cupsUTF32ToUTF8(
555     cups_utf8_t        *dest,		/* O - Target string */
556     const cups_utf32_t *src,		/* I - Source string */
557     const int          maxout)		/* I - Max output */
558 {
559   cups_utf8_t	*start;			/* Start of destination string */
560   int		i;			/* Looping variable */
561   int		swap;			/* Byte-swap input to output */
562   cups_utf32_t	ch;			/* Character value */
563 
564 
565  /*
566   * Check for valid arguments and clear output...
567   */
568 
569   DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", (void *)dest, (void *)src, maxout));
570 
571   if (dest)
572     *dest = '\0';
573 
574   if (!dest || !src || maxout < 1)
575   {
576     DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
577 
578     return (-1);
579   }
580 
581  /*
582   * Check for leading BOM in UTF-32 and inverted BOM...
583   */
584 
585   start = dest;
586   swap  = *src == 0xfffe0000;
587 
588   DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
589 
590   if (*src == 0xfffe0000 || *src == 0xfeff)
591     src ++;
592 
593  /*
594   * Convert input UTF-32 to output UTF-8...
595   */
596 
597   for (i = maxout - 1; *src && i > 0;)
598   {
599     ch = *src++;
600 
601    /*
602     * Byte swap input UTF-32, if necessary...
603     * (only byte-swapping 24 of 32 bits)
604     */
605 
606     if (swap)
607       ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
608 
609    /*
610     * Check for beyond Plane 16 (invalid UTF-32)...
611     */
612 
613     if (ch > 0x10ffff)
614     {
615       DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
616 
617       return (-1);
618     }
619 
620    /*
621     * Convert UTF-32 character to UTF-8 character(s)...
622     */
623 
624     if (ch < 0x80)
625     {
626      /*
627       * One-octet UTF-8 <= 127 (US-ASCII)...
628       */
629 
630       *dest++ = (cups_utf8_t)ch;
631       i --;
632 
633       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
634     }
635     else if (ch < 0x800)
636     {
637      /*
638       * Two-octet UTF-8 <= 2047 (Latin-x)...
639       */
640 
641       if (i < 2)
642       {
643         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
644 
645         return (-1);
646       }
647 
648       *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
649       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
650       i -= 2;
651 
652       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
653                     dest[-2], dest[-1]));
654     }
655     else if (ch < 0x10000)
656     {
657      /*
658       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
659       */
660 
661       if (i < 3)
662       {
663         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
664 
665         return (-1);
666       }
667 
668       *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
669       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
670       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
671       i -= 3;
672 
673       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
674                     dest[-3], dest[-2], dest[-1]));
675     }
676     else
677     {
678      /*
679       * Four-octet UTF-8...
680       */
681 
682       if (i < 4)
683       {
684         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
685 
686         return (-1);
687       }
688 
689       *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
690       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
691       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
692       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
693       i -= 4;
694 
695       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
696                     (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
697     }
698   }
699 
700   *dest = '\0';
701 
702   DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
703 
704   return ((int)(dest - start));
705 }
706