1 /*
2  * Transcoding support for CUPS.
3  *
4  * Copyright 2007-2014 by Apple Inc.
5  * Copyright 1997-2007 by Easy Software Products.
6  *
7  * These coded instructions, statements, and computer programs are the
8  * property of Apple Inc. and are protected by Federal copyright
9  * law.  Distribution and use rights are outlined in the file "LICENSE.txt"
10  * which should have been included with this file.  If this file is
11  * missing or damaged, see the license at "http://www.cups.org/".
12  *
13  * This file is subject to the Apple OS-Developed Software exception.
14  */
15 
16 /*
17  * Include necessary headers...
18  */
19 
20 #include "cups-private.h"
21 #include <limits.h>
22 #include <time.h>
23 #ifdef HAVE_ICONV_H
24 #  include <iconv.h>
25 #endif /* HAVE_ICONV_H */
26 
27 
28 /*
29  * Local globals...
30  */
31 
#ifdef HAVE_ICONV_H
/*
 * File-scope cache holding one pair of iconv descriptors for a single
 * legacy encoding at a time.  (iconv_t)-1 marks a descriptor that is not
 * open, and a map_encoding of CUPS_AUTO_ENCODING marks an empty cache.
 */

static _cups_mutex_t	map_mutex = _CUPS_MUTEX_INITIALIZER;
					/* Mutex to control access to maps */
static iconv_t		map_from_utf8 = (iconv_t)-1;
					/* Convert from UTF-8 to charset */
static iconv_t		map_to_utf8 = (iconv_t)-1;
					/* Convert from charset to UTF-8 */
static cups_encoding_t	map_encoding = CUPS_AUTO_ENCODING;
					/* Which charset is cached */
#endif /* HAVE_ICONV_H */
42 
43 
/*
 * '_cupsCharmapFlush()' - Release the cached iconv conversion descriptors.
 *
 * Closes whichever of the two cached descriptors is currently open, marks
 * both as unopened, and resets the cached encoding to CUPS_AUTO_ENCODING.
 * This function does not take map_mutex itself; the converters in this file
 * call it with the mutex already held.  It does nothing when HAVE_ICONV_H is
 * not defined.
 */

void
_cupsCharmapFlush(void)
{
#ifdef HAVE_ICONV_H
  int		i;			/* Looping var */
  iconv_t	*maps[2];		/* Cached descriptors to release */


  maps[0] = &map_from_utf8;
  maps[1] = &map_to_utf8;

  for (i = 0; i < 2; i ++)
  {
   /*
    * Skip descriptors that were never opened (or failed to open)...
    */

    if (*maps[i] == (iconv_t)-1)
      continue;

    iconv_close(*maps[i]);
    *maps[i] = (iconv_t)-1;
  }

  map_encoding = CUPS_AUTO_ENCODING;
#endif /* HAVE_ICONV_H */
}
67 
68 
69 /*
70  * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
71  */
72 
73 int					/* O - Count or -1 on error */
cupsCharsetToUTF8(cups_utf8_t * dest,const char * src,const int maxout,const cups_encoding_t encoding)74 cupsCharsetToUTF8(
75     cups_utf8_t           *dest,	/* O - Target string */
76     const char            *src,		/* I - Source string */
77     const int             maxout,	/* I - Max output */
78     const cups_encoding_t encoding)	/* I - Encoding */
79 {
80   cups_utf8_t	*destptr;		/* Pointer into UTF-8 buffer */
81 #ifdef HAVE_ICONV_H
82   size_t	srclen,			/* Length of source string */
83 		outBytesLeft;		/* Bytes remaining in output buffer */
84 #endif /* HAVE_ICONV_H */
85 
86 
87  /*
88   * Check for valid arguments...
89   */
90 
91   DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)", (void *)dest, src, maxout, encoding));
92 
93   if (!dest || !src || maxout < 1)
94   {
95     if (dest)
96       *dest = '\0';
97 
98     DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
99     return (-1);
100   }
101 
102  /*
103   * Handle identity conversions...
104   */
105 
106   if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
107       encoding >= CUPS_ENCODING_VBCS_END)
108   {
109     strlcpy((char *)dest, src, (size_t)maxout);
110     return ((int)strlen((char *)dest));
111   }
112 
113  /*
114   * Handle ISO-8859-1 to UTF-8 directly...
115   */
116 
117   destptr = dest;
118 
119   if (encoding == CUPS_ISO8859_1)
120   {
121     int		ch;			/* Character from string */
122     cups_utf8_t	*destend;		/* End of UTF-8 buffer */
123 
124 
125     destend = dest + maxout - 2;
126 
127     while (*src && destptr < destend)
128     {
129       ch = *src++ & 255;
130 
131       if (ch & 128)
132       {
133 	*destptr++ = (cups_utf8_t)(0xc0 | (ch >> 6));
134 	*destptr++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
135       }
136       else
137 	*destptr++ = (cups_utf8_t)ch;
138     }
139 
140     *destptr = '\0';
141 
142     return ((int)(destptr - dest));
143   }
144 
145  /*
146   * Convert input legacy charset to UTF-8...
147   */
148 
149 #ifdef HAVE_ICONV_H
150   _cupsMutexLock(&map_mutex);
151 
152   if (map_encoding != encoding)
153   {
154     char	toset[1024];		/* Destination character set */
155 
156     _cupsCharmapFlush();
157 
158     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
159 
160     map_encoding  = encoding;
161     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
162     map_to_utf8   = iconv_open("UTF-8", toset);
163   }
164 
165   if (map_to_utf8 != (iconv_t)-1)
166   {
167     char *altdestptr = (char *)dest;	/* Silence bogus GCC type-punned */
168 
169     srclen       = strlen(src);
170     outBytesLeft = (size_t)maxout - 1;
171 
172     iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft);
173     *altdestptr = '\0';
174 
175     _cupsMutexUnlock(&map_mutex);
176 
177     return ((int)(altdestptr - (char *)dest));
178   }
179 
180   _cupsMutexUnlock(&map_mutex);
181 #endif /* HAVE_ICONV_H */
182 
183  /*
184   * No iconv() support, so error out...
185   */
186 
187   *destptr = '\0';
188 
189   return (-1);
190 }
191 
192 
193 /*
194  * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
195  */
196 
197 int					/* O - Count or -1 on error */
cupsUTF8ToCharset(char * dest,const cups_utf8_t * src,const int maxout,const cups_encoding_t encoding)198 cupsUTF8ToCharset(
199     char		  *dest,	/* O - Target string */
200     const cups_utf8_t	  *src,		/* I - Source string */
201     const int		  maxout,	/* I - Max output */
202     const cups_encoding_t encoding)	/* I - Encoding */
203 {
204   char		*destptr;		/* Pointer into destination */
205 #ifdef HAVE_ICONV_H
206   size_t	srclen,			/* Length of source string */
207 		outBytesLeft;		/* Bytes remaining in output buffer */
208 #endif /* HAVE_ICONV_H */
209 
210 
211  /*
212   * Check for valid arguments...
213   */
214 
215   if (!dest || !src || maxout < 1)
216   {
217     if (dest)
218       *dest = '\0';
219 
220     return (-1);
221   }
222 
223  /*
224   * Handle identity conversions...
225   */
226 
227   if (encoding == CUPS_UTF8 ||
228       encoding >= CUPS_ENCODING_VBCS_END)
229   {
230     strlcpy(dest, (char *)src, (size_t)maxout);
231     return ((int)strlen(dest));
232   }
233 
234  /*
235   * Handle UTF-8 to ISO-8859-1 directly...
236   */
237 
238   destptr = dest;
239 
240   if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII)
241   {
242     int		ch,			/* Character from string */
243 		maxch;			/* Maximum character for charset */
244     char	*destend;		/* End of ISO-8859-1 buffer */
245 
246     maxch   = encoding == CUPS_ISO8859_1 ? 256 : 128;
247     destend = dest + maxout - 1;
248 
249     while (*src && destptr < destend)
250     {
251       ch = *src++;
252 
253       if ((ch & 0xe0) == 0xc0)
254       {
255 	ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
256 
257 	if (ch < maxch)
258           *destptr++ = (char)ch;
259 	else
260           *destptr++ = '?';
261       }
262       else if ((ch & 0xf0) == 0xe0 ||
263                (ch & 0xf8) == 0xf0)
264         *destptr++ = '?';
265       else if (!(ch & 0x80))
266 	*destptr++ = (char)ch;
267     }
268 
269     *destptr = '\0';
270 
271     return ((int)(destptr - dest));
272   }
273 
274 #ifdef HAVE_ICONV_H
275  /*
276   * Convert input UTF-8 to legacy charset...
277   */
278 
279   _cupsMutexLock(&map_mutex);
280 
281   if (map_encoding != encoding)
282   {
283     char	toset[1024];		/* Destination character set */
284 
285     _cupsCharmapFlush();
286 
287     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
288 
289     map_encoding  = encoding;
290     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
291     map_to_utf8   = iconv_open("UTF-8", toset);
292   }
293 
294   if (map_from_utf8 != (iconv_t)-1)
295   {
296     char *altsrc = (char *)src;		/* Silence bogus GCC type-punned */
297 
298     srclen       = strlen((char *)src);
299     outBytesLeft = (size_t)maxout - 1;
300 
301     iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft);
302     *destptr = '\0';
303 
304     _cupsMutexUnlock(&map_mutex);
305 
306     return ((int)(destptr - dest));
307   }
308 
309   _cupsMutexUnlock(&map_mutex);
310 #endif /* HAVE_ICONV_H */
311 
312  /*
313   * No iconv() support, so error out...
314   */
315 
316   *destptr = '\0';
317 
318   return (-1);
319 }
320 
321 
322 /*
323  * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
324  *
325  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
326  *
327  *   UTF-32 char     UTF-8 char(s)
328  *   --------------------------------------------------
329  *	  0 to 127 = 0xxxxxxx (US-ASCII)
330  *     128 to 2047 = 110xxxxx 10yyyyyy
331  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
332  *	   > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
333  *
334  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
335  * which would convert to five- or six-octet UTF-8 sequences...
336  */
337 
338 int					/* O - Count or -1 on error */
cupsUTF8ToUTF32(cups_utf32_t * dest,const cups_utf8_t * src,const int maxout)339 cupsUTF8ToUTF32(
340     cups_utf32_t      *dest,		/* O - Target string */
341     const cups_utf8_t *src,		/* I - Source string */
342     const int         maxout)		/* I - Max output */
343 {
344   int		i;			/* Looping variable */
345   cups_utf8_t	ch;			/* Character value */
346   cups_utf8_t	next;			/* Next character value */
347   cups_utf32_t	ch32;			/* UTF-32 character value */
348 
349 
350  /*
351   * Check for valid arguments and clear output...
352   */
353 
354   DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", (void *)dest, src, maxout));
355 
356   if (dest)
357     *dest = 0;
358 
359   if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
360   {
361     DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
362 
363     return (-1);
364   }
365 
366  /*
367   * Convert input UTF-8 to output UTF-32...
368   */
369 
370   for (i = maxout - 1; *src && i > 0; i --)
371   {
372     ch = *src++;
373 
374    /*
375     * Convert UTF-8 character(s) to UTF-32 character...
376     */
377 
378     if (!(ch & 0x80))
379     {
380      /*
381       * One-octet UTF-8 <= 127 (US-ASCII)...
382       */
383 
384       *dest++ = ch;
385 
386       DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
387       continue;
388     }
389     else if ((ch & 0xe0) == 0xc0)
390     {
391      /*
392       * Two-octet UTF-8 <= 2047 (Latin-x)...
393       */
394 
395       next = *src++;
396       if ((next & 0xc0) != 0x80)
397       {
398         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
399 
400 	return (-1);
401       }
402 
403       ch32 = (cups_utf32_t)((ch & 0x1f) << 6) | (cups_utf32_t)(next & 0x3f);
404 
405      /*
406       * Check for non-shortest form (invalid UTF-8)...
407       */
408 
409       if (ch32 < 0x80)
410       {
411         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
412 
413 	return (-1);
414       }
415 
416       *dest++ = ch32;
417 
418       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
419                     src[-2], src[-1], (unsigned)ch32));
420     }
421     else if ((ch & 0xf0) == 0xe0)
422     {
423      /*
424       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
425       */
426 
427       next = *src++;
428       if ((next & 0xc0) != 0x80)
429       {
430         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
431 
432 	return (-1);
433       }
434 
435       ch32 = (cups_utf32_t)((ch & 0x0f) << 6) | (cups_utf32_t)(next & 0x3f);
436 
437       next = *src++;
438       if ((next & 0xc0) != 0x80)
439       {
440         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
441 
442 	return (-1);
443       }
444 
445       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
446 
447      /*
448       * Check for non-shortest form (invalid UTF-8)...
449       */
450 
451       if (ch32 < 0x800)
452       {
453         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
454 
455 	return (-1);
456       }
457 
458       *dest++ = ch32;
459 
460       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
461                     src[-3], src[-2], src[-1], (unsigned)ch32));
462     }
463     else if ((ch & 0xf8) == 0xf0)
464     {
465      /*
466       * Four-octet UTF-8...
467       */
468 
469       next = *src++;
470       if ((next & 0xc0) != 0x80)
471       {
472         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
473 
474 	return (-1);
475       }
476 
477       ch32 = (cups_utf32_t)((ch & 0x07) << 6) | (cups_utf32_t)(next & 0x3f);
478 
479       next = *src++;
480       if ((next & 0xc0) != 0x80)
481       {
482         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
483 
484 	return (-1);
485       }
486 
487       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
488 
489       next = *src++;
490       if ((next & 0xc0) != 0x80)
491       {
492         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
493 
494 	return (-1);
495       }
496 
497       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
498 
499      /*
500       * Check for non-shortest form (invalid UTF-8)...
501       */
502 
503       if (ch32 < 0x10000)
504       {
505         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
506 
507 	return (-1);
508       }
509 
510       *dest++ = ch32;
511 
512       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
513                     src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
514     }
515     else
516     {
517      /*
518       * More than 4-octet (invalid UTF-8 sequence)...
519       */
520 
521       DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
522 
523       return (-1);
524     }
525 
526    /*
527     * Check for UTF-16 surrogate (illegal UTF-8)...
528     */
529 
530     if (ch32 >= 0xd800 && ch32 <= 0xdfff)
531       return (-1);
532   }
533 
534   *dest = 0;
535 
536   DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
537 
538   return (maxout - 1 - i);
539 }
540 
541 
542 /*
543  * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
544  *
545  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
546  *
547  *   UTF-32 char     UTF-8 char(s)
548  *   --------------------------------------------------
549  *	  0 to 127 = 0xxxxxxx (US-ASCII)
550  *     128 to 2047 = 110xxxxx 10yyyyyy
551  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
552  *	   > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
553  *
554  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
555  * which would convert to five- or six-octet UTF-8 sequences...
556  */
557 
558 int					/* O - Count or -1 on error */
cupsUTF32ToUTF8(cups_utf8_t * dest,const cups_utf32_t * src,const int maxout)559 cupsUTF32ToUTF8(
560     cups_utf8_t        *dest,		/* O - Target string */
561     const cups_utf32_t *src,		/* I - Source string */
562     const int          maxout)		/* I - Max output */
563 {
564   cups_utf8_t	*start;			/* Start of destination string */
565   int		i;			/* Looping variable */
566   int		swap;			/* Byte-swap input to output */
567   cups_utf32_t	ch;			/* Character value */
568 
569 
570  /*
571   * Check for valid arguments and clear output...
572   */
573 
574   DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", (void *)dest, (void *)src, maxout));
575 
576   if (dest)
577     *dest = '\0';
578 
579   if (!dest || !src || maxout < 1)
580   {
581     DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
582 
583     return (-1);
584   }
585 
586  /*
587   * Check for leading BOM in UTF-32 and inverted BOM...
588   */
589 
590   start = dest;
591   swap  = *src == 0xfffe0000;
592 
593   DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
594 
595   if (*src == 0xfffe0000 || *src == 0xfeff)
596     src ++;
597 
598  /*
599   * Convert input UTF-32 to output UTF-8...
600   */
601 
602   for (i = maxout - 1; *src && i > 0;)
603   {
604     ch = *src++;
605 
606    /*
607     * Byte swap input UTF-32, if necessary...
608     * (only byte-swapping 24 of 32 bits)
609     */
610 
611     if (swap)
612       ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
613 
614    /*
615     * Check for beyond Plane 16 (invalid UTF-32)...
616     */
617 
618     if (ch > 0x10ffff)
619     {
620       DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
621 
622       return (-1);
623     }
624 
625    /*
626     * Convert UTF-32 character to UTF-8 character(s)...
627     */
628 
629     if (ch < 0x80)
630     {
631      /*
632       * One-octet UTF-8 <= 127 (US-ASCII)...
633       */
634 
635       *dest++ = (cups_utf8_t)ch;
636       i --;
637 
638       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
639     }
640     else if (ch < 0x800)
641     {
642      /*
643       * Two-octet UTF-8 <= 2047 (Latin-x)...
644       */
645 
646       if (i < 2)
647       {
648         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
649 
650         return (-1);
651       }
652 
653       *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
654       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
655       i -= 2;
656 
657       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
658                     dest[-2], dest[-1]));
659     }
660     else if (ch < 0x10000)
661     {
662      /*
663       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
664       */
665 
666       if (i < 3)
667       {
668         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
669 
670         return (-1);
671       }
672 
673       *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
674       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
675       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
676       i -= 3;
677 
678       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
679                     dest[-3], dest[-2], dest[-1]));
680     }
681     else
682     {
683      /*
684       * Four-octet UTF-8...
685       */
686 
687       if (i < 4)
688       {
689         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
690 
691         return (-1);
692       }
693 
694       *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
695       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
696       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
697       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
698       i -= 4;
699 
700       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
701                     (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
702     }
703   }
704 
705   *dest = '\0';
706 
707   DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
708 
709   return ((int)(dest - start));
710 }
711