1 /*************************************************************************
2 *
3 *   Copyright (C) 2016 and later: Unicode, Inc. and others.
4 *   License & terms of use: http://www.unicode.org/copyright.html#License
5 *
6 **************************************************************************
7 **************************************************************************
8 *
9 *   Copyright (C) 2000-2016, International Business Machines
10 *   Corporation and others.  All Rights Reserved.
11 *
12 ***************************************************************************
13 *   file name:  convsamp.c
14 *   encoding:   ASCII (7-bit)
15 *
16 *   created on: 2000may30
17 *   created by: Steven R. Loomis
18 *
19 *   Sample code for the ICU conversion routines.
20 *
21 * Note: Nothing special is needed to build this sample. Link with
22 *       the icu UC and icu I18N libraries.
23 *
24 *       I use 'assert' for error checking, you probably will want
25 *       something more flexible.  '***BEGIN SAMPLE***' and
26 *       '***END SAMPLE***' mark pieces suitable for stand alone
27 *       code snippets.
28 *
29 *
*  Each test can define its own BUFFERSIZE
31 *
32 */
33 
34 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
35 
36 #include <stdio.h>
37 #include <ctype.h>            /* for isspace, etc.    */
38 #include <assert.h>
39 #include <string.h>
40 #include <stdlib.h>  /* malloc */
41 
42 #include "cmemory.h"
43 #include "unicode/utypes.h"   /* Basic ICU data types */
44 #include "unicode/ucnv.h"     /* C   Converter API    */
45 #include "unicode/ustring.h"  /* some more string fcns*/
46 #include "unicode/uchar.h"    /* char names           */
47 #include "unicode/uloc.h"
48 #include "unicode/unistr.h"
49 
50 #include "flagcb.h"
51 
52 /* Some utility functions */
53 
54 static const UChar kNone[] = { 0x0000 };
55 
/*
 * U_ASSERT(x): if the UErrorCode x holds a failure, flush both standard
 * streams, report "<expression> == <error name>" on stderr and then assert.
 * Wrapped in do { } while(0) so that "U_ASSERT(status);" is a single
 * statement and remains safe inside an unbraced if/else.
 * Note: x is evaluated more than once; pass a plain variable.
 */
#define U_ASSERT(x) \
  do { \
    if(U_FAILURE(x)) { \
      fflush(stdout); \
      fflush(stderr); \
      fprintf(stderr, #x " == %s\n", u_errorName(x)); \
      assert(U_SUCCESS(x)); \
    } \
  } while(0)
57 
58 /* Print a UChar if possible, in seven characters. */
prettyPrintUChar(UChar c)59 void prettyPrintUChar(UChar c)
60 {
61   if(  (c <= 0x007F) &&
62        (isgraph(c))  ) {
63     printf(" '%c'   ", (char)(0x00FF&c));
64   } else if ( c > 0x007F ) {
65     char buf[1000];
66     UErrorCode status = U_ZERO_ERROR;
67     int32_t o;
68 
69     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
70     if(U_SUCCESS(status) && (o>0) ) {
71       buf[6] = 0;
72       printf("%7s", buf);
73     } else {
74       printf(" ??????");
75     }
76   } else {
77     switch((char)(c & 0x007F)) {
78     case ' ':
79       printf(" ' '   ");
80       break;
81     case '\t':
82       printf(" \\t    ");
83       break;
84     case '\n':
85       printf(" \\n    ");
86       break;
87     default:
88       printf("  _    ");
89       break;
90     }
91   }
92 }
93 
94 
printUChars(const char * name="?",const UChar * uch=kNone,int32_t len=-1)95 void printUChars(const char  *name = "?",
96                  const UChar *uch  = kNone,
97                  int32_t     len   = -1 )
98 {
99   int32_t i;
100 
101   if( (len == -1) && (uch) ) {
102     len = u_strlen(uch);
103   }
104 
105   printf("%5s: ", name);
106   for( i = 0; i <len; i++) {
107     printf("%-6d ", i);
108   }
109   printf("\n");
110 
111   printf("%5s: ", "uni");
112   for( i = 0; i <len; i++) {
113     printf("\\u%04X ", (int)uch[i]);
114   }
115   printf("\n");
116 
117   printf("%5s:", "ch");
118   for( i = 0; i <len; i++) {
119     prettyPrintUChar(uch[i]);
120   }
121   printf("\n");
122 }
123 
/*
 * Dump a byte string to stdout as three aligned rows:
 *   <name>: the index of each byte
 *   uni:    each byte as \xXX (the row label is "uni", as in printUChars)
 *   ch:     the byte in single quotes if it is a graphic character,
 *           blank padding otherwise
 *
 * name  label for the index row
 * uch   the bytes to print
 * len   number of bytes; -1 means "measure with strlen()"
 *       (only done when uch is non-null)
 */
void printBytes(const char  *name = "?",
                 const char *uch  = "",
                 int32_t     len   = -1 )
{
  int32_t pos;

  if( uch && (len == -1) ) {
    len = strlen(uch);
  }

  /* index row */
  printf("%5s: ", name);
  pos = 0;
  while( pos < len ) {
    printf("%-4d ", pos);
    ++pos;
  }
  printf("\n");

  /* hex row */
  printf("%5s: ", "uni");
  pos = 0;
  while( pos < len ) {
    printf("\\x%02X ", 0x00FF & (int)uch[pos]);
    ++pos;
  }
  printf("\n");

  /* rendered character row */
  printf("%5s:", "ch");
  pos = 0;
  while( pos < len ) {
    /* mask to 0..255 so a negative char is never handed to isgraph() */
    if( !isgraph(0x00FF & (int)uch[pos]) ) {
      printf("     ");
    } else {
      printf(" '%c' ", (char)uch[pos]);
    }
    ++pos;
  }
  printf("\n");
}
156 
printUChar(UChar32 ch32)157 void printUChar(UChar32 ch32)
158 {
159     if(ch32 > 0xFFFF) {
160       printf("ch: U+%06X\n", ch32);
161     }
162     else {
163       UChar ch = (UChar)ch32;
164       printUChars("C", &ch, 1);
165     }
166 }
167 
168 /*******************************************************************
169   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
170   followed by an exclamation mark (!) into the KOI8-R Russian code page.
171 
172   This example first creates a UChar String out of the Unicode chars.
173 
174   targetSize must be set to the amount of space available in the target
175   buffer. After fromUChars is called,
176   len will contain the number of bytes in target[] which were
177   used in the resulting codepage.  In this case, there is a 1:1 mapping
178   between the input and output characters. The exclamation mark has the
179   same value in both KOI8-R and Unicode.
180 
181   src: 0      1      2      3      4      5      6
182   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
183    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
184 
185  targ:  0    1    2    3    4    5    6
186   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
187    ch:                                '!'
188 
189 
190 Converting FROM unicode
191   to koi8-r.
192   You must call ucnv_close to clean up the memory used by the
193   converter.
194 
195   'len' returns the number of OUTPUT bytes resulting from the
196   conversion.
197  */
198 
convsample_02()199 UErrorCode convsample_02()
200 {
201   printf("\n\n==============================================\n"
202          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
203 
204 
205   // **************************** START SAMPLE *******************
206   // "cat<cat>OK"
207   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
208                      0x0430, 0x0021, 0x0000 };
209   char target[100];
210   UErrorCode status = U_ZERO_ERROR;
211   UConverter *conv;
212   int32_t     len;
213 
214   // set up the converter
215   //! [ucnv_open]
216   conv = ucnv_open("koi8-r", &status);
217   //! [ucnv_open]
218   assert(U_SUCCESS(status));
219 
220   // convert to koi8-r
221   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
222   assert(U_SUCCESS(status));
223 
224   // close the converter
225   ucnv_close(conv);
226 
227   // ***************************** END SAMPLE ********************
228 
229   // Print it out
230   printUChars("src", source);
231   printf("\n");
232   printBytes("targ", target, len);
233 
234   return U_ZERO_ERROR;
235 }
236 
237 
convsample_03()238 UErrorCode convsample_03()
239 {
240   printf("\n\n==============================================\n"
241          "Sample 03: C: print out all converters\n");
242 
243   int32_t count;
244   int32_t i;
245 
246   // **************************** START SAMPLE *******************
247   count = ucnv_countAvailable();
248   printf("Available converters: %d\n", count);
249 
250   for(i=0;i<count;i++)
251   {
252     printf("%s ", ucnv_getAvailableName(i));
253   }
254 
255   // ***************************** END SAMPLE ********************
256 
257   printf("\n");
258 
259   return U_ZERO_ERROR;
260 }
261 
262 
263 
264 #define BUFFERSIZE 17 /* make it interesting :) */
265 
266 /*
267   Converting from a codepage to Unicode in bulk..
268   What is the best way to determine the buffer size?
269 
270      The 'buffersize' is in bytes of input.
    For a given converter, dividing this by the minimum char size
    gives you the maximum number of Unicode characters that could be
273     expected for a given number of input bytes.
274      see: ucnv_getMinCharSize()
275 
276      For example, a single byte codepage like 'Latin-3' has a
277     minimum char size of 1. (It takes at least 1 byte to represent
278     each Unicode char.) So the unicode buffer has the same number of
279     UChars as the input buffer has bytes.
280 
281      In a strictly double byte codepage such as cp1362 (Windows
282     Korean), the minimum char size is 2. So, only half as many Unicode
283     chars as bytes are needed.
284 
285      This work to calculate the buffer size is an optimization. Any
286     size of input and output buffer can be used, as long as the
287     program handles the following cases: If the input buffer is empty,
288     the source pointer will be equal to sourceLimit.  If the output
289     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
290  */
291 
convsample_05()292 UErrorCode convsample_05()
293 {
294   printf("\n\n==============================================\n"
295          "Sample 05: C: count the number of letters in a UTF-8 document\n");
296 
297   FILE *f;
298   int32_t count;
299   char inBuf[BUFFERSIZE];
300   const char *source;
301   const char *sourceLimit;
302   UChar *uBuf;
303   UChar *target;
304   UChar *targetLimit;
305   UChar *p;
306   int32_t uBufSize = 0;
307   UConverter *conv;
308   UErrorCode status = U_ZERO_ERROR;
309   uint32_t letters=0, total=0;
310 
311   f = fopen("data01.txt", "r");
312   if(!f)
313   {
314     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
315     return U_FILE_ACCESS_ERROR;
316   }
317 
318   // **************************** START SAMPLE *******************
319   conv = ucnv_open("utf-8", &status);
320   assert(U_SUCCESS(status));
321 
322   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
323   printf("input bytes %d / min chars %d = %d UChars\n",
324          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
325   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
326   assert(uBuf!=NULL);
327 
328   // grab another buffer's worth
329   while((!feof(f)) &&
330         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
331   {
332     // Convert bytes to unicode
333     source = inBuf;
334     sourceLimit = inBuf + count;
335 
336     do
337     {
338         target = uBuf;
339         targetLimit = uBuf + uBufSize;
340 
341         ucnv_toUnicode(conv, &target, targetLimit,
342                        &source, sourceLimit, NULL,
343                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
344                                    /* is true (when no more data will come) */
345                        &status);
346 
347         if(status == U_BUFFER_OVERFLOW_ERROR)
348         {
349           // simply ran out of space - we'll reset the target ptr the next
350           // time through the loop.
351           status = U_ZERO_ERROR;
352         }
353         else
354         {
355           //  Check other errors here.
356           assert(U_SUCCESS(status));
357           // Break out of the loop (by force)
358         }
359 
360         // Process the Unicode
361         // Todo: handle UTF-16/surrogates
362 
363         for(p = uBuf; p<target; p++)
364         {
365           if(u_isalpha(*p))
366             letters++;
367           total++;
368         }
369     } while (source < sourceLimit); // while simply out of space
370   }
371 
372   printf("%d letters out of %d total UChars.\n", letters, total);
373 
374   // ***************************** END SAMPLE ********************
375   ucnv_close(conv);
376 
377   printf("\n");
378 
379   fclose(f);
380 
381   return U_ZERO_ERROR;
382 }
383 #undef BUFFERSIZE
384 
385 #define BUFFERSIZE 1024
386 typedef struct
387 {
388   UChar32  codepoint;
389   uint32_t frequency;
390 } CharFreqInfo;
391 
convsample_06()392 UErrorCode convsample_06()
393 {
394   printf("\n\n==============================================\n"
395          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
396 
397   FILE *f;
398   int32_t count;
399   char inBuf[BUFFERSIZE];
400   const char *source;
401   const char *sourceLimit;
402   int32_t uBufSize = 0;
403   UConverter *conv;
404   UErrorCode status = U_ZERO_ERROR;
405   uint32_t letters=0, total=0;
406 
407   CharFreqInfo   *info;
408   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
409   UChar32   p;
410 
411   uint32_t ie = 0;
412   uint32_t gh = 0;
413   UChar32 l = 0;
414 
415   f = fopen("data06.txt", "r");
416   if(!f)
417   {
418     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
419     return U_FILE_ACCESS_ERROR;
420   }
421 
422   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
423   if(!info)
424   {
425     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
426   }
427 
428   /* reset frequencies */
429   for(p=0;p<charCount;p++)
430   {
431     info[p].codepoint = p;
432     info[p].frequency = 0;
433   }
434 
435   // **************************** START SAMPLE *******************
436   conv = ucnv_open("utf-8", &status);
437   assert(U_SUCCESS(status));
438 
439   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
440   printf("input bytes %d / min chars %d = %d UChars\n",
441          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
442 
443   // grab another buffer's worth
444   while((!feof(f)) &&
445         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
446   {
447     // Convert bytes to unicode
448     source = inBuf;
449     sourceLimit = inBuf + count;
450 
451     while(source < sourceLimit)
452     {
453       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
454       if(U_FAILURE(status))
455       {
456         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
457         status = U_ZERO_ERROR;
458         continue;
459       }
460       U_ASSERT(status);
461       total++;
462 
463       if(u_isalpha(p))
464         letters++;
465 
466       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
467         ie++;
468 
469       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
470         gh++;
471 
472       if(p>charCount)
473       {
474         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
475         free(info);
476         fclose(f);
477         ucnv_close(conv);
478         return U_UNSUPPORTED_ERROR;
479       }
480       info[p].frequency++;
481       l = p;
482     }
483   }
484 
485   fclose(f);
486   ucnv_close(conv);
487 
488   printf("%d letters out of %d total UChars.\n", letters, total);
489   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
490 
491   // now, we could sort it..
492 
493   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
494 
495   for(p=0;p<charCount;p++)
496   {
497     if(info[p].frequency)
498     {
499       printf("% 5d U+%06X ", info[p].frequency, p);
500       if(p <= 0xFFFF)
501       {
502         prettyPrintUChar((UChar)p);
503       }
504       printf("\n");
505     }
506   }
507   free(info);
508   // ***************************** END SAMPLE ********************
509 
510   printf("\n");
511 
512   return U_ZERO_ERROR;
513 }
514 #undef BUFFERSIZE
515 
516 
517 /******************************************************
518   You must call ucnv_close to clean up the memory used by the
519   converter.
520 
  'len' returns the number of OUTPUT UChars resulting from the
  conversion.
523  */
524 
convsample_12()525 UErrorCode convsample_12()
526 {
527   printf("\n\n==============================================\n"
528          "Sample 12: C: simple sjis -> unicode conversion\n");
529 
530 
531   // **************************** START SAMPLE *******************
532 
533   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
534   UChar target[100];
535   UErrorCode status = U_ZERO_ERROR;
536   UConverter *conv;
537   int32_t     len;
538 
539   // set up the converter
540   conv = ucnv_open("shift_jis", &status);
541   assert(U_SUCCESS(status));
542 
543   // convert to Unicode
544   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
545   target[6] = 0xFDCA;
546   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
547   U_ASSERT(status);
548   // close the converter
549   ucnv_close(conv);
550 
551   // ***************************** END SAMPLE ********************
552 
553   // Print it out
554   printBytes("src", source, strlen(source) );
555   printf("\n");
556   printUChars("targ", target, len);
557 
558   return U_ZERO_ERROR;
559 }
560 
561 /******************************************************************
562    C: Convert from codepage to Unicode one at a time.
563 */
564 
convsample_13()565 UErrorCode convsample_13()
566 {
567   printf("\n\n==============================================\n"
568          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
569 
570 
571   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
572   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
573   const char *source, *sourceLimit;
574   UChar32 target;
575   UErrorCode status = U_ZERO_ERROR;
576   UConverter *conv = NULL;
577   int32_t srcCount=0;
578   int32_t dstCount=0;
579 
580   srcCount = sizeof(sourceChars);
581 
582   conv = ucnv_open("Big5", &status);
583   U_ASSERT(status);
584 
585   source = sourceChars;
586   sourceLimit = sourceChars + sizeof(sourceChars);
587 
588   // **************************** START SAMPLE *******************
589 
590 
591   printBytes("src",source,sourceLimit-source);
592 
593   while(source < sourceLimit)
594   {
595     puts("");
596     target = ucnv_getNextUChar (conv,
597                                 &source,
598                                 sourceLimit,
599                                 &status);
600 
601     //    printBytes("src",source,sourceLimit-source);
602     U_ASSERT(status);
603     printUChar(target);
604     dstCount++;
605   }
606 
607 
608   // ************************** END SAMPLE *************************
609 
610   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
611   ucnv_close(conv);
612 
613   return U_ZERO_ERROR;
614 }
615 
616 
617 
618 
convsample_20_didSubstitute(const char * source)619 UBool convsample_20_didSubstitute(const char *source)
620 {
621   UChar uchars[100];
622   char bytes[100];
623   UConverter *conv = NULL;
624   UErrorCode status = U_ZERO_ERROR;
625   uint32_t len, len2;
626   UBool  flagVal;
627 
628   FromUFLAGContext * context = NULL;
629 
630   printf("\n\n==============================================\n"
631          "Sample 20: C: Test for substitution using callbacks\n");
632 
633   /* print out the original source */
634   printBytes("src", source);
635   printf("\n");
636 
637   /* First, convert from UTF8 to unicode */
638   conv = ucnv_open("utf-8", &status);
639   U_ASSERT(status);
640 
641   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
642   U_ASSERT(status);
643 
644   printUChars("uch", uchars, len);
645   printf("\n");
646 
647   /* Now, close the converter */
648   ucnv_close(conv);
649 
650   /* Now, convert to windows-1252 */
651   conv = ucnv_open("windows-1252", &status);
652   U_ASSERT(status);
653 
654   /* Converter starts out with the SUBSTITUTE callback set. */
655 
656   /* initialize our callback */
657   context = flagCB_fromU_openContext();
658 
659   /* Set our special callback */
660   ucnv_setFromUCallBack(conv,
661                         flagCB_fromU,
662                         context,
663                         &(context->subCallback),
664                         &(context->subContext),
665                         &status);
666 
667   U_ASSERT(status);
668 
669   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
670   U_ASSERT(status);
671 
672   flagVal = context->flag;  /* it's about to go away when we close the cnv */
673 
674   ucnv_close(conv);
675 
676   /* print out the original source */
677   printBytes("bytes", bytes, len2);
678 
679   return flagVal; /* true if callback was called */
680 }
681 
convsample_20()682 UErrorCode convsample_20()
683 {
684   const char *sample1 = "abc\xdf\xbf";
685   const char *sample2 = "abc_def";
686 
687 
688   if(convsample_20_didSubstitute(sample1))
689   {
690     printf("DID substitute.\n******\n");
691   }
692   else
693   {
694     printf("Did NOT substitute.\n*****\n");
695   }
696 
697   if(convsample_20_didSubstitute(sample2))
698   {
699     printf("DID substitute.\n******\n");
700   }
701   else
702   {
703     printf("Did NOT substitute.\n*****\n");
704   }
705 
706   return U_ZERO_ERROR;
707 }
708 
709 // 21  - C, callback, with clone and debug
710 
711 
712 
convsample_21_didSubstitute(const char * source)713 UBool convsample_21_didSubstitute(const char *source)
714 {
715   UChar uchars[100];
716   char bytes[100];
717   UConverter *conv = NULL, *cloneCnv = NULL;
718   UErrorCode status = U_ZERO_ERROR;
719   uint32_t len, len2;
720   int32_t  cloneLen;
721   UBool  flagVal = FALSE;
722   UConverterFromUCallback junkCB;
723 
724   FromUFLAGContext *flagCtx = NULL,
725                    *cloneFlagCtx = NULL;
726 
727   debugCBContext   *debugCtx1 = NULL,
728                    *debugCtx2 = NULL,
729                    *cloneDebugCtx = NULL;
730 
731   printf("\n\n==============================================\n"
732          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
733 
734   /* print out the original source */
735   printBytes("src", source);
736   printf("\n");
737 
738   /* First, convert from UTF8 to unicode */
739   conv = ucnv_open("utf-8", &status);
740   U_ASSERT(status);
741 
742   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
743   U_ASSERT(status);
744 
745   printUChars("uch", uchars, len);
746   printf("\n");
747 
748   /* Now, close the converter */
749   ucnv_close(conv);
750 
751   /* Now, convert to windows-1252 */
752   conv = ucnv_open("windows-1252", &status);
753   U_ASSERT(status);
754 
755   /* Converter starts out with the SUBSTITUTE callback set. */
756 
757   /* initialize our callback */
758   /* from the 'bottom' innermost, out
759    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
760 
761 #if DEBUG_TMI
762   printf("flagCB_fromU = %p\n", &flagCB_fromU);
763   printf("debugCB_fromU = %p\n", &debugCB_fromU);
764 #endif
765 
766   debugCtx1 = debugCB_openContext();
767    flagCtx  = flagCB_fromU_openContext();
768   debugCtx2 = debugCB_openContext();
769 
770   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
771   debugCtx1->subContext  =  flagCtx;
772 
773   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
774   flagCtx->subContext    =  debugCtx2;
775 
776   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
777   debugCtx2->subContext  = NULL;
778 
779   /* Set our special callback */
780 
781   ucnv_setFromUCallBack(conv,
782                         debugCB_fromU,
783                         debugCtx1,
784                         &(debugCtx2->subCallback),
785                         &(debugCtx2->subContext),
786                         &status);
787 
788   U_ASSERT(status);
789 
790 #if DEBUG_TMI
791   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
792          conv, debugCtx1, debugCtx1->subCallback,
793          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
794 #endif
795 
796   cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
797 
798   U_ASSERT(status);
799 
800 #if DEBUG_TMI
801   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
802 #endif
803 
804   ucnv_close(conv);
805 
806 #if DEBUG_TMI
807   printf("%p closed.\n", conv);
808 #endif
809 
810   U_ASSERT(status);
811   /* Now, we have to extract the context */
812   cloneDebugCtx = NULL;
813   cloneFlagCtx  = NULL;
814 
815   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
816   if(cloneDebugCtx != NULL) {
817       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
818   }
819 
820   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
821          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
822 
823   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
824   U_ASSERT(status);
825 
826   if(cloneFlagCtx != NULL) {
827       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
828   } else {
829       printf("** Warning, couldn't get the subcallback \n");
830   }
831 
832   ucnv_close(cloneCnv);
833 
834   /* print out the original source */
835   printBytes("bytes", bytes, len2);
836 
837   return flagVal; /* true if callback was called */
838 }
839 
convsample_21()840 UErrorCode convsample_21()
841 {
842   const char *sample1 = "abc\xdf\xbf";
843   const char *sample2 = "abc_def";
844 
845   if(convsample_21_didSubstitute(sample1))
846   {
847     printf("DID substitute.\n******\n");
848   }
849   else
850   {
851     printf("Did NOT substitute.\n*****\n");
852   }
853 
854   if(convsample_21_didSubstitute(sample2))
855   {
856     printf("DID substitute.\n******\n");
857   }
858   else
859   {
860     printf("Did NOT substitute.\n*****\n");
861   }
862 
863   return U_ZERO_ERROR;
864 }
865 
866 
867 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
868 
869 #define BUFFERSIZE 17 /* make it interesting :) */
870 
convsample_40()871 UErrorCode convsample_40()
872 {
873   printf("\n\n==============================================\n"
874     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
875 
876   FILE *f;
877   FILE *out;
878   int32_t count;
879   char inBuf[BUFFERSIZE];
880   const char *source;
881   const char *sourceLimit;
882   UChar *uBuf;
883   UChar *target;
884   UChar *targetLimit;
885   int32_t uBufSize = 0;
886   UConverter *conv = NULL;
887   UErrorCode status = U_ZERO_ERROR;
888   uint32_t inbytes=0, total=0;
889 
890   f = fopen("data02.bin", "rb");
891   if(!f)
892   {
893     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
894     return U_FILE_ACCESS_ERROR;
895   }
896 
897   out = fopen("data40.utf16", "wb");
898   if(!out)
899   {
900     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
901     fclose(f);
902     return U_FILE_ACCESS_ERROR;
903   }
904 
905   // **************************** START SAMPLE *******************
906   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
907   assert(U_SUCCESS(status));
908 
909   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
910   printf("input bytes %d / min chars %d = %d UChars\n",
911          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
912   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
913   assert(uBuf!=NULL);
914 
915   // grab another buffer's worth
916   while((!feof(f)) &&
917         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
918   {
919     inbytes += count;
920 
921     // Convert bytes to unicode
922     source = inBuf;
923     sourceLimit = inBuf + count;
924 
925     do
926     {
927         target = uBuf;
928         targetLimit = uBuf + uBufSize;
929 
930         ucnv_toUnicode( conv, &target, targetLimit,
931                        &source, sourceLimit, NULL,
932                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
933                                    /* is true (when no more data will come) */
934                          &status);
935 
936         if(status == U_BUFFER_OVERFLOW_ERROR)
937         {
938           // simply ran out of space - we'll reset the target ptr the next
939           // time through the loop.
940           status = U_ZERO_ERROR;
941         }
942         else
943         {
944           //  Check other errors here.
945           assert(U_SUCCESS(status));
946           // Break out of the loop (by force)
947         }
948 
949         // Process the Unicode
950         // Todo: handle UTF-16/surrogates
951         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
952                (size_t)(target-uBuf));
953         total += (target-uBuf);
954     } while (source < sourceLimit); // while simply out of space
955   }
956 
957   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
958 
959   // ***************************** END SAMPLE ********************
960   ucnv_close(conv);
961 
962   fclose(f);
963   fclose(out);
964   printf("\n");
965 
966   return U_ZERO_ERROR;
967 }
968 #undef BUFFERSIZE
969 
970 
971 
972 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
973 
974 #define BUFFERSIZE 24 /* make it interesting :) */
975 
convsample_46()976 UErrorCode convsample_46()
977 {
978   printf("\n\n==============================================\n"
979     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
980 
981   FILE *f;
982   FILE *out;
983   int32_t count;
984   UChar inBuf[BUFFERSIZE];
985   const UChar *source;
986   const UChar *sourceLimit;
987   char *buf;
988   char *target;
989   char *targetLimit;
990 
991   int32_t bufSize = 0;
992   UConverter *conv = NULL;
993   UErrorCode status = U_ZERO_ERROR;
994   uint32_t inchars=0, total=0;
995 
996   f = fopen("data40.utf16", "rb");
997   if(!f)
998   {
999     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
1000     return U_FILE_ACCESS_ERROR;
1001   }
1002 
1003   out = fopen("data46.out", "wb");
1004   if(!out)
1005   {
1006     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1007     fclose(f);
1008     return U_FILE_ACCESS_ERROR;
1009   }
1010 
1011   // **************************** START SAMPLE *******************
1012   conv = ucnv_open( "iso-8859-2", &status);
1013   assert(U_SUCCESS(status));
1014 
1015   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1016   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1017          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1018   buf = (char*)malloc(bufSize * sizeof(char));
1019   assert(buf!=NULL);
1020 
1021   // grab another buffer's worth
1022   while((!feof(f)) &&
1023         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1024   {
1025     inchars += count;
1026 
1027     // Convert bytes to unicode
1028     source = inBuf;
1029     sourceLimit = inBuf + count;
1030 
1031     do
1032     {
1033         target = buf;
1034         targetLimit = buf + bufSize;
1035 
1036         ucnv_fromUnicode( conv, &target, targetLimit,
1037                        &source, sourceLimit, NULL,
1038                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1039                                    /* is true (when no more data will come) */
1040                          &status);
1041 
1042         if(status == U_BUFFER_OVERFLOW_ERROR)
1043         {
1044           // simply ran out of space - we'll reset the target ptr the next
1045           // time through the loop.
1046           status = U_ZERO_ERROR;
1047         }
1048         else
1049         {
1050           //  Check other errors here.
1051           assert(U_SUCCESS(status));
1052           // Break out of the loop (by force)
1053         }
1054 
1055         // Process the Unicode
1056         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1057                (size_t)(target-buf));
1058         total += (target-buf);
1059     } while (source < sourceLimit); // while simply out of space
1060   }
1061 
1062   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1063 
1064   // ***************************** END SAMPLE ********************
1065   ucnv_close(conv);
1066 
1067   fclose(f);
1068   fclose(out);
1069   printf("\n");
1070 
1071   return U_ZERO_ERROR;
1072 }
1073 #undef BUFFERSIZE
1074 
1075 #define BUFFERSIZE 219
1076 
convsample_50()1077 void convsample_50() {
1078   printf("\n\n==============================================\n"
1079          "Sample 50: C: ucnv_detectUnicodeSignature\n");
1080 
1081   //! [ucnv_detectUnicodeSignature]
1082   UErrorCode err = U_ZERO_ERROR;
1083   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1084   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1085   int32_t signatureLength = 0;
1086   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1087   UConverter *conv = NULL;
1088   UChar output[100];
1089   UChar *target = output, *out;
1090   const char *source = input;
1091   if(encoding!=NULL && U_SUCCESS(err)){
1092     // should signature be discarded ?
1093     conv = ucnv_open(encoding, &err);
1094     // do the conversion
1095     ucnv_toUnicode(conv,
1096                    &target, output + UPRV_LENGTHOF(output),
1097                    &source, input + sizeof(input),
1098                    NULL, TRUE, &err);
1099     out = output;
1100     if (discardSignature){
1101       ++out; // ignore initial U+FEFF
1102     }
1103     while(out != target) {
1104       printf("%04x ", *out++);
1105     }
1106     puts("");
1107   }
1108   //! [ucnv_detectUnicodeSignature]
1109   puts("");
1110 }
1111 
1112 
1113 
1114 /* main */
1115 
main()1116 int main()
1117 {
1118 
1119   printf("Default Converter=%s\n", ucnv_getDefaultName() );
1120 
1121   convsample_02();  // C  , u->koi8r, conv
1122   convsample_03();  // C,   iterate
1123 
1124   convsample_05();  // C,  utf8->u, getNextUChar
1125   convsample_06(); // C freq counter thingy
1126 
1127   convsample_12();  // C,  sjis->u, conv
1128   convsample_13();  // C,  big5->u, getNextU
1129 
1130   convsample_20();  // C, callback
1131   convsample_21();  // C, callback debug
1132 
1133   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1134 
1135   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1136 
1137   convsample_50();  // C, detect unicode signature
1138 
1139   printf("End of converter samples.\n");
1140 
1141   fflush(stdout);
1142   fflush(stderr);
1143 
1144   return 0;
1145 }
1146