1 /*************************************************************************
2 *
3 * Copyright (C) 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
5 *
6 **************************************************************************
7 **************************************************************************
8 *
9 * Copyright (C) 2000-2016, International Business Machines
10 * Corporation and others. All Rights Reserved.
11 *
12 ***************************************************************************
13 * file name: convsamp.c
14 * encoding: ASCII (7-bit)
15 *
16 * created on: 2000may30
17 * created by: Steven R. Loomis
18 *
19 * Sample code for the ICU conversion routines.
20 *
21 * Note: Nothing special is needed to build this sample. Link with
22 * the icu UC and icu I18N libraries.
23 *
24 * I use 'assert' for error checking, you probably will want
25 * something more flexible. '***BEGIN SAMPLE***' and
26 * '***END SAMPLE***' mark pieces suitable for stand alone
27 * code snippets.
28 *
29 *
 *   Each test can define its own BUFFERSIZE
31 *
32 */
33
34 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */
35
36 #include <stdio.h>
37 #include <ctype.h> /* for isspace, etc. */
38 #include <assert.h>
39 #include <string.h>
40 #include <stdlib.h> /* malloc */
41
42 #include "cmemory.h"
43 #include "unicode/utypes.h" /* Basic ICU data types */
44 #include "unicode/ucnv.h" /* C Converter API */
45 #include "unicode/ustring.h" /* some more string fcns*/
46 #include "unicode/uchar.h" /* char names */
47 #include "unicode/uloc.h"
48 #include "unicode/unistr.h"
49
50 #include "flagcb.h"
51
52 /* Some utility functions */
53
54 static const UChar kNone[] = { 0x0000 };
55
56 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
57
58 /* Print a UChar if possible, in seven characters. */
prettyPrintUChar(UChar c)59 void prettyPrintUChar(UChar c)
60 {
61 if( (c <= 0x007F) &&
62 (isgraph(c)) ) {
63 printf(" '%c' ", (char)(0x00FF&c));
64 } else if ( c > 0x007F ) {
65 char buf[1000];
66 UErrorCode status = U_ZERO_ERROR;
67 int32_t o;
68
69 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
70 if(U_SUCCESS(status) && (o>0) ) {
71 buf[6] = 0;
72 printf("%7s", buf);
73 } else {
74 printf(" ??????");
75 }
76 } else {
77 switch((char)(c & 0x007F)) {
78 case ' ':
79 printf(" ' ' ");
80 break;
81 case '\t':
82 printf(" \\t ");
83 break;
84 case '\n':
85 printf(" \\n ");
86 break;
87 default:
88 printf(" _ ");
89 break;
90 }
91 }
92 }
93
94
printUChars(const char * name="?",const UChar * uch=kNone,int32_t len=-1)95 void printUChars(const char *name = "?",
96 const UChar *uch = kNone,
97 int32_t len = -1 )
98 {
99 int32_t i;
100
101 if( (len == -1) && (uch) ) {
102 len = u_strlen(uch);
103 }
104
105 printf("%5s: ", name);
106 for( i = 0; i <len; i++) {
107 printf("%-6d ", i);
108 }
109 printf("\n");
110
111 printf("%5s: ", "uni");
112 for( i = 0; i <len; i++) {
113 printf("\\u%04X ", (int)uch[i]);
114 }
115 printf("\n");
116
117 printf("%5s:", "ch");
118 for( i = 0; i <len; i++) {
119 prettyPrintUChar(uch[i]);
120 }
121 printf("\n");
122 }
123
/*
 * Dump a run of bytes as three rows of five-character cells: the index of each
 * byte, its value in \xNN form, and the byte itself between single quotes when
 * isgraph() accepts it.
 *
 *   name  label printed in front of the index row
 *   uch   bytes to dump
 *   len   number of bytes, or -1 to measure a NUL-terminated string with
 *         strlen()
 */
void printBytes(const char *name = "?",
                const char *uch = "",
                int32_t len = -1)
{
    int32_t i;

    if ((len == -1) && (uch)) {
        len = (int32_t)strlen(uch);
    }

    printf("%5s: ", name);
    for (i = 0; i < len; i++) {
        printf("%-4d ", i);
    }
    printf("\n");

    printf("%5s: ", "uni");
    for (i = 0; i < len; i++) {
        /* mask so that a negative char prints as two hex digits */
        printf("\\x%02X ", 0x00FF & (int)uch[i]);
    }
    printf("\n");

    printf("%5s:", "ch");
    for (i = 0; i < len; i++) {
        if (isgraph(0x00FF & (int)uch[i])) {
            printf(" '%c' ", (char)uch[i]);
        } else {
            /* blank cell, five wide like every other cell, to keep the
               following characters under their own index */
            printf("     ");
        }
    }
    printf("\n");
}
156
printUChar(UChar32 ch32)157 void printUChar(UChar32 ch32)
158 {
159 if(ch32 > 0xFFFF) {
160 printf("ch: U+%06X\n", ch32);
161 }
162 else {
163 UChar ch = (UChar)ch32;
164 printUChars("C", &ch, 1);
165 }
166 }
167
168 /*******************************************************************
169 Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
170 followed by an exclamation mark (!) into the KOI8-R Russian code page.
171
172 This example first creates a UChar String out of the Unicode chars.
173
174 targetSize must be set to the amount of space available in the target
175 buffer. After fromUChars is called,
176 len will contain the number of bytes in target[] which were
177 used in the resulting codepage. In this case, there is a 1:1 mapping
178 between the input and output characters. The exclamation mark has the
179 same value in both KOI8-R and Unicode.
180
181 src: 0 1 2 3 4 5 6
182 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
183 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
184
185 targ: 0 1 2 3 4 5 6
186 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
187 ch: '!'
188
189
190 Converting FROM unicode
191 to koi8-r.
192 You must call ucnv_close to clean up the memory used by the
193 converter.
194
195 'len' returns the number of OUTPUT bytes resulting from the
196 conversion.
197 */
198
convsample_02()199 UErrorCode convsample_02()
200 {
201 printf("\n\n==============================================\n"
202 "Sample 02: C: simple Unicode -> koi8-r conversion\n");
203
204
205 // **************************** START SAMPLE *******************
206 // "cat<cat>OK"
207 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
208 0x0430, 0x0021, 0x0000 };
209 char target[100];
210 UErrorCode status = U_ZERO_ERROR;
211 UConverter *conv;
212 int32_t len;
213
214 // set up the converter
215 //! [ucnv_open]
216 conv = ucnv_open("koi8-r", &status);
217 //! [ucnv_open]
218 assert(U_SUCCESS(status));
219
220 // convert to koi8-r
221 len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
222 assert(U_SUCCESS(status));
223
224 // close the converter
225 ucnv_close(conv);
226
227 // ***************************** END SAMPLE ********************
228
229 // Print it out
230 printUChars("src", source);
231 printf("\n");
232 printBytes("targ", target, len);
233
234 return U_ZERO_ERROR;
235 }
236
237
convsample_03()238 UErrorCode convsample_03()
239 {
240 printf("\n\n==============================================\n"
241 "Sample 03: C: print out all converters\n");
242
243 int32_t count;
244 int32_t i;
245
246 // **************************** START SAMPLE *******************
247 count = ucnv_countAvailable();
248 printf("Available converters: %d\n", count);
249
250 for(i=0;i<count;i++)
251 {
252 printf("%s ", ucnv_getAvailableName(i));
253 }
254
255 // ***************************** END SAMPLE ********************
256
257 printf("\n");
258
259 return U_ZERO_ERROR;
260 }
261
262
263
264 #define BUFFERSIZE 17 /* make it interesting :) */
265
266 /*
267 Converting from a codepage to Unicode in bulk..
268 What is the best way to determine the buffer size?
269
270 The 'buffersize' is in bytes of input.
  For a given converter, dividing this by the minimum char size
  gives you the maximum number of Unicode characters that could be
  expected for a given number of input bytes.
274 see: ucnv_getMinCharSize()
275
276 For example, a single byte codepage like 'Latin-3' has a
277 minimum char size of 1. (It takes at least 1 byte to represent
278 each Unicode char.) So the unicode buffer has the same number of
279 UChars as the input buffer has bytes.
280
281 In a strictly double byte codepage such as cp1362 (Windows
282 Korean), the minimum char size is 2. So, only half as many Unicode
283 chars as bytes are needed.
284
285 This work to calculate the buffer size is an optimization. Any
286 size of input and output buffer can be used, as long as the
287 program handles the following cases: If the input buffer is empty,
288 the source pointer will be equal to sourceLimit. If the output
289 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
290 */
291
convsample_05()292 UErrorCode convsample_05()
293 {
294 printf("\n\n==============================================\n"
295 "Sample 05: C: count the number of letters in a UTF-8 document\n");
296
297 FILE *f;
298 int32_t count;
299 char inBuf[BUFFERSIZE];
300 const char *source;
301 const char *sourceLimit;
302 UChar *uBuf;
303 UChar *target;
304 UChar *targetLimit;
305 UChar *p;
306 int32_t uBufSize = 0;
307 UConverter *conv;
308 UErrorCode status = U_ZERO_ERROR;
309 uint32_t letters=0, total=0;
310
311 f = fopen("data01.txt", "r");
312 if(!f)
313 {
314 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
315 return U_FILE_ACCESS_ERROR;
316 }
317
318 // **************************** START SAMPLE *******************
319 conv = ucnv_open("utf-8", &status);
320 assert(U_SUCCESS(status));
321
322 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
323 printf("input bytes %d / min chars %d = %d UChars\n",
324 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
325 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
326 assert(uBuf!=NULL);
327
328 // grab another buffer's worth
329 while((!feof(f)) &&
330 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
331 {
332 // Convert bytes to unicode
333 source = inBuf;
334 sourceLimit = inBuf + count;
335
336 do
337 {
338 target = uBuf;
339 targetLimit = uBuf + uBufSize;
340
341 ucnv_toUnicode(conv, &target, targetLimit,
342 &source, sourceLimit, NULL,
343 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
344 /* is true (when no more data will come) */
345 &status);
346
347 if(status == U_BUFFER_OVERFLOW_ERROR)
348 {
349 // simply ran out of space - we'll reset the target ptr the next
350 // time through the loop.
351 status = U_ZERO_ERROR;
352 }
353 else
354 {
355 // Check other errors here.
356 assert(U_SUCCESS(status));
357 // Break out of the loop (by force)
358 }
359
360 // Process the Unicode
361 // Todo: handle UTF-16/surrogates
362
363 for(p = uBuf; p<target; p++)
364 {
365 if(u_isalpha(*p))
366 letters++;
367 total++;
368 }
369 } while (source < sourceLimit); // while simply out of space
370 }
371
372 printf("%d letters out of %d total UChars.\n", letters, total);
373
374 // ***************************** END SAMPLE ********************
375 ucnv_close(conv);
376
377 printf("\n");
378
379 fclose(f);
380
381 return U_ZERO_ERROR;
382 }
383 #undef BUFFERSIZE
384
385 #define BUFFERSIZE 1024
386 typedef struct
387 {
388 UChar32 codepoint;
389 uint32_t frequency;
390 } CharFreqInfo;
391
convsample_06()392 UErrorCode convsample_06()
393 {
394 printf("\n\n==============================================\n"
395 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
396
397 FILE *f;
398 int32_t count;
399 char inBuf[BUFFERSIZE];
400 const char *source;
401 const char *sourceLimit;
402 int32_t uBufSize = 0;
403 UConverter *conv;
404 UErrorCode status = U_ZERO_ERROR;
405 uint32_t letters=0, total=0;
406
407 CharFreqInfo *info;
408 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
409 UChar32 p;
410
411 uint32_t ie = 0;
412 uint32_t gh = 0;
413 UChar32 l = 0;
414
415 f = fopen("data06.txt", "r");
416 if(!f)
417 {
418 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
419 return U_FILE_ACCESS_ERROR;
420 }
421
422 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
423 if(!info)
424 {
425 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
426 }
427
428 /* reset frequencies */
429 for(p=0;p<charCount;p++)
430 {
431 info[p].codepoint = p;
432 info[p].frequency = 0;
433 }
434
435 // **************************** START SAMPLE *******************
436 conv = ucnv_open("utf-8", &status);
437 assert(U_SUCCESS(status));
438
439 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
440 printf("input bytes %d / min chars %d = %d UChars\n",
441 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
442
443 // grab another buffer's worth
444 while((!feof(f)) &&
445 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
446 {
447 // Convert bytes to unicode
448 source = inBuf;
449 sourceLimit = inBuf + count;
450
451 while(source < sourceLimit)
452 {
453 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
454 if(U_FAILURE(status))
455 {
456 fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
457 status = U_ZERO_ERROR;
458 continue;
459 }
460 U_ASSERT(status);
461 total++;
462
463 if(u_isalpha(p))
464 letters++;
465
466 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
467 ie++;
468
469 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
470 gh++;
471
472 if(p>charCount)
473 {
474 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
475 free(info);
476 fclose(f);
477 ucnv_close(conv);
478 return U_UNSUPPORTED_ERROR;
479 }
480 info[p].frequency++;
481 l = p;
482 }
483 }
484
485 fclose(f);
486 ucnv_close(conv);
487
488 printf("%d letters out of %d total UChars.\n", letters, total);
489 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
490
491 // now, we could sort it..
492
493 // qsort(info, charCount, sizeof(info[0]), charfreq_compare);
494
495 for(p=0;p<charCount;p++)
496 {
497 if(info[p].frequency)
498 {
499 printf("% 5d U+%06X ", info[p].frequency, p);
500 if(p <= 0xFFFF)
501 {
502 prettyPrintUChar((UChar)p);
503 }
504 printf("\n");
505 }
506 }
507 free(info);
508 // ***************************** END SAMPLE ********************
509
510 printf("\n");
511
512 return U_ZERO_ERROR;
513 }
514 #undef BUFFERSIZE
515
516
517 /******************************************************
518 You must call ucnv_close to clean up the memory used by the
519 converter.
520
521 'len' returns the number of OUTPUT bytes resulting from the
522 conversion.
523 */
524
convsample_12()525 UErrorCode convsample_12()
526 {
527 printf("\n\n==============================================\n"
528 "Sample 12: C: simple sjis -> unicode conversion\n");
529
530
531 // **************************** START SAMPLE *******************
532
533 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
534 UChar target[100];
535 UErrorCode status = U_ZERO_ERROR;
536 UConverter *conv;
537 int32_t len;
538
539 // set up the converter
540 conv = ucnv_open("shift_jis", &status);
541 assert(U_SUCCESS(status));
542
543 // convert to Unicode
544 // Note: we can use strlen, we know it's an 8 bit null terminated codepage
545 target[6] = 0xFDCA;
546 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
547 U_ASSERT(status);
548 // close the converter
549 ucnv_close(conv);
550
551 // ***************************** END SAMPLE ********************
552
553 // Print it out
554 printBytes("src", source, strlen(source) );
555 printf("\n");
556 printUChars("targ", target, len);
557
558 return U_ZERO_ERROR;
559 }
560
561 /******************************************************************
562 C: Convert from codepage to Unicode one at a time.
563 */
564
convsample_13()565 UErrorCode convsample_13()
566 {
567 printf("\n\n==============================================\n"
568 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
569
570
571 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
572 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
573 const char *source, *sourceLimit;
574 UChar32 target;
575 UErrorCode status = U_ZERO_ERROR;
576 UConverter *conv = NULL;
577 int32_t srcCount=0;
578 int32_t dstCount=0;
579
580 srcCount = sizeof(sourceChars);
581
582 conv = ucnv_open("Big5", &status);
583 U_ASSERT(status);
584
585 source = sourceChars;
586 sourceLimit = sourceChars + sizeof(sourceChars);
587
588 // **************************** START SAMPLE *******************
589
590
591 printBytes("src",source,sourceLimit-source);
592
593 while(source < sourceLimit)
594 {
595 puts("");
596 target = ucnv_getNextUChar (conv,
597 &source,
598 sourceLimit,
599 &status);
600
601 // printBytes("src",source,sourceLimit-source);
602 U_ASSERT(status);
603 printUChar(target);
604 dstCount++;
605 }
606
607
608 // ************************** END SAMPLE *************************
609
610 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
611 ucnv_close(conv);
612
613 return U_ZERO_ERROR;
614 }
615
616
617
618
convsample_20_didSubstitute(const char * source)619 UBool convsample_20_didSubstitute(const char *source)
620 {
621 UChar uchars[100];
622 char bytes[100];
623 UConverter *conv = NULL;
624 UErrorCode status = U_ZERO_ERROR;
625 uint32_t len, len2;
626 UBool flagVal;
627
628 FromUFLAGContext * context = NULL;
629
630 printf("\n\n==============================================\n"
631 "Sample 20: C: Test for substitution using callbacks\n");
632
633 /* print out the original source */
634 printBytes("src", source);
635 printf("\n");
636
637 /* First, convert from UTF8 to unicode */
638 conv = ucnv_open("utf-8", &status);
639 U_ASSERT(status);
640
641 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
642 U_ASSERT(status);
643
644 printUChars("uch", uchars, len);
645 printf("\n");
646
647 /* Now, close the converter */
648 ucnv_close(conv);
649
650 /* Now, convert to windows-1252 */
651 conv = ucnv_open("windows-1252", &status);
652 U_ASSERT(status);
653
654 /* Converter starts out with the SUBSTITUTE callback set. */
655
656 /* initialize our callback */
657 context = flagCB_fromU_openContext();
658
659 /* Set our special callback */
660 ucnv_setFromUCallBack(conv,
661 flagCB_fromU,
662 context,
663 &(context->subCallback),
664 &(context->subContext),
665 &status);
666
667 U_ASSERT(status);
668
669 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
670 U_ASSERT(status);
671
672 flagVal = context->flag; /* it's about to go away when we close the cnv */
673
674 ucnv_close(conv);
675
676 /* print out the original source */
677 printBytes("bytes", bytes, len2);
678
679 return flagVal; /* true if callback was called */
680 }
681
convsample_20()682 UErrorCode convsample_20()
683 {
684 const char *sample1 = "abc\xdf\xbf";
685 const char *sample2 = "abc_def";
686
687
688 if(convsample_20_didSubstitute(sample1))
689 {
690 printf("DID substitute.\n******\n");
691 }
692 else
693 {
694 printf("Did NOT substitute.\n*****\n");
695 }
696
697 if(convsample_20_didSubstitute(sample2))
698 {
699 printf("DID substitute.\n******\n");
700 }
701 else
702 {
703 printf("Did NOT substitute.\n*****\n");
704 }
705
706 return U_ZERO_ERROR;
707 }
708
709 // 21 - C, callback, with clone and debug
710
711
712
convsample_21_didSubstitute(const char * source)713 UBool convsample_21_didSubstitute(const char *source)
714 {
715 UChar uchars[100];
716 char bytes[100];
717 UConverter *conv = NULL, *cloneCnv = NULL;
718 UErrorCode status = U_ZERO_ERROR;
719 uint32_t len, len2;
720 int32_t cloneLen;
721 UBool flagVal = FALSE;
722 UConverterFromUCallback junkCB;
723
724 FromUFLAGContext *flagCtx = NULL,
725 *cloneFlagCtx = NULL;
726
727 debugCBContext *debugCtx1 = NULL,
728 *debugCtx2 = NULL,
729 *cloneDebugCtx = NULL;
730
731 printf("\n\n==============================================\n"
732 "Sample 21: C: Test for substitution w/ callbacks & clones \n");
733
734 /* print out the original source */
735 printBytes("src", source);
736 printf("\n");
737
738 /* First, convert from UTF8 to unicode */
739 conv = ucnv_open("utf-8", &status);
740 U_ASSERT(status);
741
742 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
743 U_ASSERT(status);
744
745 printUChars("uch", uchars, len);
746 printf("\n");
747
748 /* Now, close the converter */
749 ucnv_close(conv);
750
751 /* Now, convert to windows-1252 */
752 conv = ucnv_open("windows-1252", &status);
753 U_ASSERT(status);
754
755 /* Converter starts out with the SUBSTITUTE callback set. */
756
757 /* initialize our callback */
758 /* from the 'bottom' innermost, out
759 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */
760
761 #if DEBUG_TMI
762 printf("flagCB_fromU = %p\n", &flagCB_fromU);
763 printf("debugCB_fromU = %p\n", &debugCB_fromU);
764 #endif
765
766 debugCtx1 = debugCB_openContext();
767 flagCtx = flagCB_fromU_openContext();
768 debugCtx2 = debugCB_openContext();
769
770 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */
771 debugCtx1->subContext = flagCtx;
772
773 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */
774 flagCtx->subContext = debugCtx2;
775
776 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE;
777 debugCtx2->subContext = NULL;
778
779 /* Set our special callback */
780
781 ucnv_setFromUCallBack(conv,
782 debugCB_fromU,
783 debugCtx1,
784 &(debugCtx2->subCallback),
785 &(debugCtx2->subContext),
786 &status);
787
788 U_ASSERT(status);
789
790 #if DEBUG_TMI
791 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
792 conv, debugCtx1, debugCtx1->subCallback,
793 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
794 #endif
795
796 cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
797
798 U_ASSERT(status);
799
800 #if DEBUG_TMI
801 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv);
802 #endif
803
804 ucnv_close(conv);
805
806 #if DEBUG_TMI
807 printf("%p closed.\n", conv);
808 #endif
809
810 U_ASSERT(status);
811 /* Now, we have to extract the context */
812 cloneDebugCtx = NULL;
813 cloneFlagCtx = NULL;
814
815 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
816 if(cloneDebugCtx != NULL) {
817 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
818 }
819
820 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
821 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
822
823 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
824 U_ASSERT(status);
825
826 if(cloneFlagCtx != NULL) {
827 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */
828 } else {
829 printf("** Warning, couldn't get the subcallback \n");
830 }
831
832 ucnv_close(cloneCnv);
833
834 /* print out the original source */
835 printBytes("bytes", bytes, len2);
836
837 return flagVal; /* true if callback was called */
838 }
839
convsample_21()840 UErrorCode convsample_21()
841 {
842 const char *sample1 = "abc\xdf\xbf";
843 const char *sample2 = "abc_def";
844
845 if(convsample_21_didSubstitute(sample1))
846 {
847 printf("DID substitute.\n******\n");
848 }
849 else
850 {
851 printf("Did NOT substitute.\n*****\n");
852 }
853
854 if(convsample_21_didSubstitute(sample2))
855 {
856 printf("DID substitute.\n******\n");
857 }
858 else
859 {
860 printf("Did NOT substitute.\n*****\n");
861 }
862
863 return U_ZERO_ERROR;
864 }
865
866
867 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16]
868
869 #define BUFFERSIZE 17 /* make it interesting :) */
870
convsample_40()871 UErrorCode convsample_40()
872 {
873 printf("\n\n==============================================\n"
874 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
875
876 FILE *f;
877 FILE *out;
878 int32_t count;
879 char inBuf[BUFFERSIZE];
880 const char *source;
881 const char *sourceLimit;
882 UChar *uBuf;
883 UChar *target;
884 UChar *targetLimit;
885 int32_t uBufSize = 0;
886 UConverter *conv = NULL;
887 UErrorCode status = U_ZERO_ERROR;
888 uint32_t inbytes=0, total=0;
889
890 f = fopen("data02.bin", "rb");
891 if(!f)
892 {
893 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
894 return U_FILE_ACCESS_ERROR;
895 }
896
897 out = fopen("data40.utf16", "wb");
898 if(!out)
899 {
900 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
901 fclose(f);
902 return U_FILE_ACCESS_ERROR;
903 }
904
905 // **************************** START SAMPLE *******************
906 conv = ucnv_openCCSID(37, UCNV_IBM, &status);
907 assert(U_SUCCESS(status));
908
909 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
910 printf("input bytes %d / min chars %d = %d UChars\n",
911 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
912 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
913 assert(uBuf!=NULL);
914
915 // grab another buffer's worth
916 while((!feof(f)) &&
917 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
918 {
919 inbytes += count;
920
921 // Convert bytes to unicode
922 source = inBuf;
923 sourceLimit = inBuf + count;
924
925 do
926 {
927 target = uBuf;
928 targetLimit = uBuf + uBufSize;
929
930 ucnv_toUnicode( conv, &target, targetLimit,
931 &source, sourceLimit, NULL,
932 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
933 /* is true (when no more data will come) */
934 &status);
935
936 if(status == U_BUFFER_OVERFLOW_ERROR)
937 {
938 // simply ran out of space - we'll reset the target ptr the next
939 // time through the loop.
940 status = U_ZERO_ERROR;
941 }
942 else
943 {
944 // Check other errors here.
945 assert(U_SUCCESS(status));
946 // Break out of the loop (by force)
947 }
948
949 // Process the Unicode
950 // Todo: handle UTF-16/surrogates
951 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
952 (size_t)(target-uBuf));
953 total += (target-uBuf);
954 } while (source < sourceLimit); // while simply out of space
955 }
956
957 printf("%d bytes in, %d UChars out.\n", inbytes, total);
958
959 // ***************************** END SAMPLE ********************
960 ucnv_close(conv);
961
962 fclose(f);
963 fclose(out);
964 printf("\n");
965
966 return U_ZERO_ERROR;
967 }
968 #undef BUFFERSIZE
969
970
971
972 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out]
973
974 #define BUFFERSIZE 24 /* make it interesting :) */
975
convsample_46()976 UErrorCode convsample_46()
977 {
978 printf("\n\n==============================================\n"
979 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
980
981 FILE *f;
982 FILE *out;
983 int32_t count;
984 UChar inBuf[BUFFERSIZE];
985 const UChar *source;
986 const UChar *sourceLimit;
987 char *buf;
988 char *target;
989 char *targetLimit;
990
991 int32_t bufSize = 0;
992 UConverter *conv = NULL;
993 UErrorCode status = U_ZERO_ERROR;
994 uint32_t inchars=0, total=0;
995
996 f = fopen("data40.utf16", "rb");
997 if(!f)
998 {
999 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
1000 return U_FILE_ACCESS_ERROR;
1001 }
1002
1003 out = fopen("data46.out", "wb");
1004 if(!out)
1005 {
1006 fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1007 fclose(f);
1008 return U_FILE_ACCESS_ERROR;
1009 }
1010
1011 // **************************** START SAMPLE *******************
1012 conv = ucnv_open( "iso-8859-2", &status);
1013 assert(U_SUCCESS(status));
1014
1015 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1016 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1017 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1018 buf = (char*)malloc(bufSize * sizeof(char));
1019 assert(buf!=NULL);
1020
1021 // grab another buffer's worth
1022 while((!feof(f)) &&
1023 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1024 {
1025 inchars += count;
1026
1027 // Convert bytes to unicode
1028 source = inBuf;
1029 sourceLimit = inBuf + count;
1030
1031 do
1032 {
1033 target = buf;
1034 targetLimit = buf + bufSize;
1035
1036 ucnv_fromUnicode( conv, &target, targetLimit,
1037 &source, sourceLimit, NULL,
1038 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
1039 /* is true (when no more data will come) */
1040 &status);
1041
1042 if(status == U_BUFFER_OVERFLOW_ERROR)
1043 {
1044 // simply ran out of space - we'll reset the target ptr the next
1045 // time through the loop.
1046 status = U_ZERO_ERROR;
1047 }
1048 else
1049 {
1050 // Check other errors here.
1051 assert(U_SUCCESS(status));
1052 // Break out of the loop (by force)
1053 }
1054
1055 // Process the Unicode
1056 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1057 (size_t)(target-buf));
1058 total += (target-buf);
1059 } while (source < sourceLimit); // while simply out of space
1060 }
1061
1062 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1063
1064 // ***************************** END SAMPLE ********************
1065 ucnv_close(conv);
1066
1067 fclose(f);
1068 fclose(out);
1069 printf("\n");
1070
1071 return U_ZERO_ERROR;
1072 }
1073 #undef BUFFERSIZE
1074
1075 #define BUFFERSIZE 219
1076
convsample_50()1077 void convsample_50() {
1078 printf("\n\n==============================================\n"
1079 "Sample 50: C: ucnv_detectUnicodeSignature\n");
1080
1081 //! [ucnv_detectUnicodeSignature]
1082 UErrorCode err = U_ZERO_ERROR;
1083 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1084 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1085 int32_t signatureLength = 0;
1086 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1087 UConverter *conv = NULL;
1088 UChar output[100];
1089 UChar *target = output, *out;
1090 const char *source = input;
1091 if(encoding!=NULL && U_SUCCESS(err)){
1092 // should signature be discarded ?
1093 conv = ucnv_open(encoding, &err);
1094 // do the conversion
1095 ucnv_toUnicode(conv,
1096 &target, output + UPRV_LENGTHOF(output),
1097 &source, input + sizeof(input),
1098 NULL, TRUE, &err);
1099 out = output;
1100 if (discardSignature){
1101 ++out; // ignore initial U+FEFF
1102 }
1103 while(out != target) {
1104 printf("%04x ", *out++);
1105 }
1106 puts("");
1107 }
1108 //! [ucnv_detectUnicodeSignature]
1109 puts("");
1110 }
1111
1112
1113
1114 /* main */
1115
main()1116 int main()
1117 {
1118
1119 printf("Default Converter=%s\n", ucnv_getDefaultName() );
1120
1121 convsample_02(); // C , u->koi8r, conv
1122 convsample_03(); // C, iterate
1123
1124 convsample_05(); // C, utf8->u, getNextUChar
1125 convsample_06(); // C freq counter thingy
1126
1127 convsample_12(); // C, sjis->u, conv
1128 convsample_13(); // C, big5->u, getNextU
1129
1130 convsample_20(); // C, callback
1131 convsample_21(); // C, callback debug
1132
1133 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16]
1134
1135 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out]
1136
1137 convsample_50(); // C, detect unicode signature
1138
1139 printf("End of converter samples.\n");
1140
1141 fflush(stdout);
1142 fflush(stderr);
1143
1144 return 0;
1145 }
1146