1 /*****************************************************************************
2 *
3 * Copyright (C) 1999-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ******************************************************************************/
7
8 /*
9 * uconv(1): an iconv(1)-like converter using ICU.
10 *
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
13 *
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
16 *
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
19 */
20
21 #include <unicode/utypes.h>
22 #include <unicode/putil.h>
23 #include <unicode/ucnv.h>
24 #include <unicode/uenum.h>
25 #include <unicode/unistr.h>
26 #include <unicode/translit.h>
27 #include <unicode/uset.h>
28 #include <unicode/uclean.h>
29 #include <unicode/utf16.h>
30
31 #include <stdio.h>
32 #include <errno.h>
33 #include <string.h>
34 #include <stdlib.h>
35
36 #include "cmemory.h"
37 #include "cstring.h"
38 #include "ustrfmt.h"
39
40 #include "unicode/uwmsg.h"
41
42 U_NAMESPACE_USE
43
44 #if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__)
45 #include <io.h>
46 #include <fcntl.h>
47 #if U_PLATFORM_USES_ONLY_WIN32_API
48 #define USE_FILENO_BINARY_MODE 1
49 /* Windows likes to rename Unix-like functions */
50 #ifndef fileno
51 #define fileno _fileno
52 #endif
53 #ifndef setmode
54 #define setmode _setmode
55 #endif
56 #ifndef O_BINARY
57 #define O_BINARY _O_BINARY
58 #endif
59 #endif
60 #endif
61
62 #ifdef UCONVMSG_LINK
63 /* below from the README */
64 #include "unicode/utypes.h"
65 #include "unicode/udata.h"
66 U_CFUNC char uconvmsg_dat[];
67 #endif
68
69 #define DEFAULT_BUFSZ 4096
70 #define UCONVMSG "uconvmsg"
71
72 static UResourceBundle *gBundle = 0; /* Bundle containing messages. */
73
74 /*
75 * Initialize the message bundle so that message strings can be fetched
76 * by u_wmsg().
77 *
78 */
79
initMsg(const char * pname)80 static void initMsg(const char *pname) {
81 static int ps = 0;
82
83 if (!ps) {
84 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */
85 UErrorCode err = U_ZERO_ERROR;
86
87 ps = 1;
88
89 /* Set up our static data - if any */
90 #if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */
91 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
92 if (U_FAILURE(err)) {
93 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
94 pname, u_errorName(err));
95 err = U_ZERO_ERROR; /* It may still fail */
96 }
97 #endif
98
99 /* Get messages. */
100 gBundle = u_wmsg_setPath(UCONVMSG, &err);
101 if (U_FAILURE(err)) {
102 fprintf(stderr,
103 "%s: warning: couldn't open bundle %s: %s\n",
104 pname, UCONVMSG, u_errorName(err));
105 #ifdef UCONVMSG_LINK
106 fprintf(stderr,
107 "%s: setAppData was called, internal data %s failed to load\n",
108 pname, UCONVMSG);
109 #endif
110
111 err = U_ZERO_ERROR;
112 /* that was try #1, try again with a path */
113 uprv_strcpy(dataPath, u_getDataDirectory());
114 uprv_strcat(dataPath, U_FILE_SEP_STRING);
115 uprv_strcat(dataPath, UCONVMSG);
116
117 gBundle = u_wmsg_setPath(dataPath, &err);
118 if (U_FAILURE(err)) {
119 fprintf(stderr,
120 "%s: warning: still couldn't open bundle %s: %s\n",
121 pname, dataPath, u_errorName(err));
122 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
123 }
124 }
125 }
126 }
127
128 /* Mapping of callback names to the callbacks passed to the converter
129 API. */
130
131 static struct callback_ent {
132 const char *name;
133 UConverterFromUCallback fromu;
134 const void *fromuctxt;
135 UConverterToUCallback tou;
136 const void *touctxt;
137 } transcode_callbacks[] = {
138 { "substitute",
139 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
140 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
141 { "skip",
142 UCNV_FROM_U_CALLBACK_SKIP, 0,
143 UCNV_TO_U_CALLBACK_SKIP, 0 },
144 { "stop",
145 UCNV_FROM_U_CALLBACK_STOP, 0,
146 UCNV_TO_U_CALLBACK_STOP, 0 },
147 { "escape",
148 UCNV_FROM_U_CALLBACK_ESCAPE, 0,
149 UCNV_TO_U_CALLBACK_ESCAPE, 0},
150 { "escape-icu",
151 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
152 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
153 { "escape-java",
154 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
155 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
156 { "escape-c",
157 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
158 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
159 { "escape-xml",
160 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
161 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
162 { "escape-xml-hex",
163 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
164 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
165 { "escape-xml-dec",
166 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
167 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
168 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
169 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
170 };
171
172 /* Return a pointer to a callback record given its name. */
173
findCallback(const char * name)174 static const struct callback_ent *findCallback(const char *name) {
175 int i, count =
176 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
177
178 /* We'll do a linear search, there aren't many of them and bsearch()
179 may not be that portable. */
180
181 for (i = 0; i < count; ++i) {
182 if (!uprv_stricmp(name, transcode_callbacks[i].name)) {
183 return &transcode_callbacks[i];
184 }
185 }
186
187 return 0;
188 }
189
190 /* Print converter information. If lookfor is set, only that converter will
191 be printed, otherwise all converters will be printed. If canon is non
192 zero, tags and aliases for each converter are printed too, in the format
193 expected for convrters.txt(5). */
194
printConverters(const char * pname,const char * lookfor,UBool canon)195 static int printConverters(const char *pname, const char *lookfor,
196 UBool canon)
197 {
198 UErrorCode err = U_ZERO_ERROR;
199 int32_t num;
200 uint16_t num_stds;
201 const char **stds;
202
203 /* If there is a specified name, just handle that now. */
204
205 if (lookfor) {
206 if (!canon) {
207 printf("%s\n", lookfor);
208 return 0;
209 } else {
210 /* Because we are printing a canonical name, we need the
211 true converter name. We've done that already except for
212 the default name (because we want to print the exact
213 name one would get when calling ucnv_getDefaultName()
214 in non-canon mode). But since we do not know at this
215 point if we have the default name or something else, we
216 need to normalize again to the canonical converter
217 name. */
218
219 const char *truename = ucnv_getAlias(lookfor, 0, &err);
220 if (U_SUCCESS(err)) {
221 lookfor = truename;
222 } else {
223 err = U_ZERO_ERROR;
224 }
225 }
226 }
227
228 /* Print converter names. We come here for one of two reasons: we
229 are printing all the names (lookfor was null), or we have a
230 single converter to print but in canon mode, hence we need to
231 get to it in order to print everything. */
232
233 num = ucnv_countAvailable();
234 if (num <= 0) {
235 initMsg(pname);
236 u_wmsg(stderr, "cantGetNames");
237 return -1;
238 }
239 if (lookfor) {
240 num = 1; /* We know where we want to be. */
241 }
242
243 num_stds = ucnv_countStandards();
244 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
245 if (!stds) {
246 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
247 return -1;
248 } else {
249 uint16_t s;
250
251 if (canon) {
252 printf("{ ");
253 }
254 for (s = 0; s < num_stds; ++s) {
255 stds[s] = ucnv_getStandard(s, &err);
256 if (canon) {
257 printf("%s ", stds[s]);
258 }
259 if (U_FAILURE(err)) {
260 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
261 goto error_cleanup;
262 }
263 }
264 if (canon) {
265 puts("}");
266 }
267 }
268
269 for (int32_t i = 0; i < num; i++) {
270 const char *name;
271 uint16_t num_aliases;
272
273 /* Set the name either to what we are looking for, or
274 to the current converter name. */
275
276 if (lookfor) {
277 name = lookfor;
278 } else {
279 name = ucnv_getAvailableName(i);
280 }
281
282 /* Get all the aliases associated to the name. */
283
284 err = U_ZERO_ERROR;
285 num_aliases = ucnv_countAliases(name, &err);
286 if (U_FAILURE(err)) {
287 printf("%s", name);
288
289 UnicodeString str(name, "");
290 putchar('\t');
291 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
292 u_wmsg_errorName(err));
293 goto error_cleanup;
294 } else {
295 uint16_t a, s, t;
296
297 /* Write all the aliases and their tags. */
298
299 for (a = 0; a < num_aliases; ++a) {
300 const char *alias = ucnv_getAlias(name, a, &err);
301
302 if (U_FAILURE(err)) {
303 UnicodeString str(name, "");
304 putchar('\t');
305 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
306 u_wmsg_errorName(err));
307 goto error_cleanup;
308 }
309
310 /* Print the current alias so that it looks right. */
311 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
312 alias,
313 (canon ? "" : " "));
314
315 /* Look (slowly, linear searching) for a tag. */
316
317 if (canon) {
318 /* -1 to skip the last standard */
319 for (s = t = 0; s < num_stds-1; ++s) {
320 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
321 if (U_SUCCESS(err)) {
322 /* List the standard tags */
323 const char *standardName;
324 UBool isFirst = TRUE;
325 UErrorCode enumError = U_ZERO_ERROR;
326 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
327 /* See if this alias is supported by this standard. */
328 if (!strcmp(standardName, alias)) {
329 if (!t) {
330 printf(" {");
331 t = 1;
332 }
333 /* Print a * after the default standard name */
334 printf(" %s%s", stds[s], (isFirst ? "*" : ""));
335 }
336 isFirst = FALSE;
337 }
338 }
339 }
340 if (t) {
341 printf(" }");
342 }
343 }
344 /* Terminate this entry. */
345 if (canon) {
346 puts("");
347 }
348
349 /* Move on. */
350 }
351 /* Terminate this entry. */
352 if (!canon) {
353 puts("");
354 }
355 }
356 }
357
358 /* Free temporary data. */
359
360 uprv_free(stds);
361
362 /* Success. */
363
364 return 0;
365 error_cleanup:
366 uprv_free(stds);
367 return -1;
368 }
369
370 /* Print all available transliterators. If canon is non zero, print
371 one transliterator per line. */
372
printTransliterators(UBool canon)373 static int printTransliterators(UBool canon)
374 {
375 #if UCONFIG_NO_TRANSLITERATION
376 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
377 return 1;
378 #else
379 UErrorCode status = U_ZERO_ERROR;
380 UEnumeration *ids = utrans_openIDs(&status);
381 int32_t i, numtrans = uenum_count(ids, &status);
382
383 char sepchar = canon ? '\n' : ' ';
384
385 for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) {
386 int32_t len;
387 const char *nextTrans = uenum_next(ids, &len, &status);
388
389 printf("%s", nextTrans);
390 if (i < numtrans - 1) {
391 putchar(sepchar);
392 }
393 }
394
395 uenum_close(ids);
396
397 /* Add a terminating newline if needed. */
398
399 if (sepchar != '\n') {
400 putchar('\n');
401 }
402
403 /* Success. */
404
405 return 0;
406 #endif
407 }
408
// Named code points used by the chunking and signature handling code.
enum {
    uSP  = 0x0020,  // space
    uCR  = 0x000d,  // carriage return
    uLF  = 0x000a,  // line feed
    uNL  = 0x0085,  // newline
    uLS  = 0x2028,  // line separator
    uPS  = 0x2029,  // paragraph separator
    uSig = 0xfeff   // signature/BOM character
};
418
419 static inline int32_t
getChunkLimit(const UnicodeString & prev,const UnicodeString & s)420 getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
421 // find one of
422 // CR, LF, CRLF, NL, LS, PS
423 // for paragraph ends (see UAX #13/Unicode 4)
424 // and include it in the chunk
425 // all of these characters are on the BMP
426 // do not include FF or VT in case they are part of a paragraph
427 // (important for bidi contexts)
428 static const UChar paraEnds[] = {
429 0xd, 0xa, 0x85, 0x2028, 0x2029
430 };
431 enum {
432 iCR, iLF, iNL, iLS, iPS, iCount
433 };
434
435 // first, see if there is a CRLF split between prev and s
436 if (prev.endsWith(paraEnds + iCR, 1)) {
437 if (s.startsWith(paraEnds + iLF, 1)) {
438 return 1; // split CRLF, include the LF
439 } else if (!s.isEmpty()) {
440 return 0; // complete the last chunk
441 } else {
442 return -1; // wait for actual further contents to arrive
443 }
444 }
445
446 const UChar *u = s.getBuffer(), *limit = u + s.length();
447 UChar c;
448
449 while (u < limit) {
450 c = *u++;
451 if (
452 ((c < uSP) && (c == uCR || c == uLF)) ||
453 (c == uNL) ||
454 ((c & uLS) == uLS)
455 ) {
456 if (c == uCR) {
457 // check for CRLF
458 if (u == limit) {
459 return -1; // LF may be in the next chunk
460 } else if (*u == uLF) {
461 ++u; // include the LF in this chunk
462 }
463 }
464 return (int32_t)(u - s.getBuffer());
465 }
466 }
467
468 return -1; // continue collecting the chunk
469 }
470
// Results of cnvSigType(): how a converter deals with U+FEFF.
enum {
    CNV_NO_FEFF   = 0,  // cannot convert the U+FEFF Unicode signature character (BOM)
    CNV_WITH_FEFF = 1,  // can convert the U+FEFF signature character
    CNV_ADDS_FEFF = 2   // automatically adds/detects the U+FEFF signature character
};
476
477 static inline UChar
nibbleToHex(uint8_t n)478 nibbleToHex(uint8_t n) {
479 n &= 0xf;
480 return
481 n <= 9 ?
482 (UChar)(0x30 + n) :
483 (UChar)((0x61 - 10) + n);
484 }
485
486 // check the converter's Unicode signature properties;
487 // the fromUnicode side of the converter must be in its initial state
488 // and will be reset again if it was used
489 static int32_t
cnvSigType(UConverter * cnv)490 cnvSigType(UConverter *cnv) {
491 UErrorCode err;
492 int32_t result;
493
494 // test if the output charset can convert U+FEFF
495 USet *set = uset_open(1, 0);
496 err = U_ZERO_ERROR;
497 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
498 if (U_SUCCESS(err) && uset_contains(set, uSig)) {
499 result = CNV_WITH_FEFF;
500 } else {
501 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
502 }
503 uset_close(set);
504
505 if (result == CNV_WITH_FEFF) {
506 // test if the output charset emits a signature anyway
507 const UChar a[1] = { 0x61 }; // "a"
508 const UChar *in;
509
510 char buffer[20];
511 char *out;
512
513 in = a;
514 out = buffer;
515 err = U_ZERO_ERROR;
516 ucnv_fromUnicode(cnv,
517 &out, buffer + sizeof(buffer),
518 &in, a + 1,
519 NULL, TRUE, &err);
520 ucnv_resetFromUnicode(cnv);
521
522 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
523 U_SUCCESS(err)
524 ) {
525 result = CNV_ADDS_FEFF;
526 }
527 }
528
529 return result;
530 }
531
532 class ConvertFile {
533 public:
ConvertFile()534 ConvertFile() :
535 buf(NULL), outbuf(NULL), fromoffsets(NULL),
536 bufsz(0), signature(0) {}
537
538 void
setBufferSize(size_t bufferSize)539 setBufferSize(size_t bufferSize) {
540 bufsz = bufferSize;
541
542 buf = new char[2 * bufsz];
543 outbuf = buf + bufsz;
544
545 // +1 for an added U+FEFF in the intermediate Unicode buffer
546 fromoffsets = new int32_t[bufsz + 1];
547 }
548
~ConvertFile()549 ~ConvertFile() {
550 delete [] buf;
551 delete [] fromoffsets;
552 }
553
554 UBool convertFile(const char *pname,
555 const char *fromcpage,
556 UConverterToUCallback toucallback,
557 const void *touctxt,
558 const char *tocpage,
559 UConverterFromUCallback fromucallback,
560 const void *fromuctxt,
561 UBool fallback,
562 const char *translit,
563 const char *infilestr,
564 FILE * outfile, int verbose);
565 private:
566 friend int main(int argc, char **argv);
567
568 char *buf, *outbuf;
569 int32_t *fromoffsets;
570
571 size_t bufsz;
572 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
573 };
574
575 // Convert a file from one encoding to another
576 UBool
convertFile(const char * pname,const char * fromcpage,UConverterToUCallback toucallback,const void * touctxt,const char * tocpage,UConverterFromUCallback fromucallback,const void * fromuctxt,UBool fallback,const char * translit,const char * infilestr,FILE * outfile,int verbose)577 ConvertFile::convertFile(const char *pname,
578 const char *fromcpage,
579 UConverterToUCallback toucallback,
580 const void *touctxt,
581 const char *tocpage,
582 UConverterFromUCallback fromucallback,
583 const void *fromuctxt,
584 UBool fallback,
585 const char *translit,
586 const char *infilestr,
587 FILE * outfile, int verbose)
588 {
589 FILE *infile;
590 UBool ret = TRUE;
591 UConverter *convfrom = 0;
592 UConverter *convto = 0;
593 UErrorCode err = U_ZERO_ERROR;
594 UBool flush;
595 UBool closeFile = FALSE;
596 const char *cbufp, *prevbufp;
597 char *bufp;
598
599 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */
600
601 const UChar *unibuf, *unibufbp;
602 UChar *unibufp;
603
604 size_t rd, wr;
605
606 #if !UCONFIG_NO_TRANSLITERATION
607 Transliterator *t = 0; // Transliterator acting on Unicode data.
608 UnicodeString chunk; // One chunk of the text being collected for transformation.
609 #endif
610 UnicodeString u; // String to do the transliteration.
611 int32_t ulen;
612
613 // use conversion offsets for error messages
614 // unless a transliterator is used -
615 // a text transformation will reorder characters in unpredictable ways
616 UBool useOffsets = TRUE;
617
618 // Open the correct input file or connect to stdin for reading input
619
620 if (infilestr != 0 && strcmp(infilestr, "-")) {
621 infile = fopen(infilestr, "rb");
622 if (infile == 0) {
623 UnicodeString str1(infilestr, "");
624 str1.append((UChar32) 0);
625 UnicodeString str2(strerror(errno), "");
626 str2.append((UChar32) 0);
627 initMsg(pname);
628 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
629 return FALSE;
630 }
631 closeFile = TRUE;
632 } else {
633 infilestr = "-";
634 infile = stdin;
635 #ifdef USE_FILENO_BINARY_MODE
636 if (setmode(fileno(stdin), O_BINARY) == -1) {
637 initMsg(pname);
638 u_wmsg(stderr, "cantSetInBinMode");
639 return FALSE;
640 }
641 #endif
642 }
643
644 if (verbose) {
645 fprintf(stderr, "%s:\n", infilestr);
646 }
647
648 #if !UCONFIG_NO_TRANSLITERATION
649 // Create transliterator as needed.
650
651 if (translit != NULL && *translit) {
652 UParseError parse;
653 UnicodeString str(translit), pestr;
654
655 /* Create from rules or by ID as needed. */
656
657 parse.line = -1;
658
659 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
660 t = Transliterator::createFromRules(UNICODE_STRING_SIMPLE("Uconv"), str, UTRANS_FORWARD, parse, err);
661 } else {
662 t = Transliterator::createInstance(UnicodeString(translit, -1, US_INV), UTRANS_FORWARD, err);
663 }
664
665 if (U_FAILURE(err)) {
666 str.append((UChar32) 0);
667 initMsg(pname);
668
669 if (parse.line >= 0) {
670 UChar linebuf[20], offsetbuf[20];
671 uprv_itou(linebuf, 20, parse.line, 10, 0);
672 uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
673 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
674 u_wmsg_errorName(err), linebuf, offsetbuf);
675 } else {
676 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
677 u_wmsg_errorName(err));
678 }
679
680 if (t) {
681 delete t;
682 t = 0;
683 }
684 goto error_exit;
685 }
686
687 useOffsets = FALSE;
688 }
689 #endif
690
691 // Create codepage converter. If the codepage or its aliases weren't
692 // available, it returns NULL and a failure code. We also set the
693 // callbacks, and return errors in the same way.
694
695 convfrom = ucnv_open(fromcpage, &err);
696 if (U_FAILURE(err)) {
697 UnicodeString str(fromcpage, "");
698 initMsg(pname);
699 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
700 u_wmsg_errorName(err));
701 goto error_exit;
702 }
703 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
704 if (U_FAILURE(err)) {
705 initMsg(pname);
706 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
707 goto error_exit;
708 }
709
710 convto = ucnv_open(tocpage, &err);
711 if (U_FAILURE(err)) {
712 UnicodeString str(tocpage, "");
713 initMsg(pname);
714 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
715 u_wmsg_errorName(err));
716 goto error_exit;
717 }
718 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
719 if (U_FAILURE(err)) {
720 initMsg(pname);
721 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
722 goto error_exit;
723 }
724 ucnv_setFallback(convto, fallback);
725
726 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
727 int8_t sig;
728
729 // OK, we can convert now.
730 sig = signature;
731 rd = 0;
732
733 do {
734 willexit = FALSE;
735
736 // input file offset at the beginning of the next buffer
737 infoffset += rd;
738
739 rd = fread(buf, 1, bufsz, infile);
740 if (ferror(infile) != 0) {
741 UnicodeString str(strerror(errno));
742 initMsg(pname);
743 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
744 goto error_exit;
745 }
746
747 // Convert the read buffer into the new encoding via Unicode.
748 // After the call 'unibufp' will be placed behind the last
749 // character that was converted in the 'unibuf'.
750 // Also the 'cbufp' is positioned behind the last converted
751 // character.
752 // At the last conversion in the file, flush should be set to
753 // true so that we get all characters converted.
754 //
755 // The converter must be flushed at the end of conversion so
756 // that characters on hold also will be written.
757
758 cbufp = buf;
759 flush = (UBool)(rd != bufsz);
760
761 // convert until the input is consumed
762 do {
763 // remember the start of the current byte-to-Unicode conversion
764 prevbufp = cbufp;
765
766 unibuf = unibufp = u.getBuffer((int32_t)bufsz);
767
768 // Use bufsz instead of u.getCapacity() for the targetLimit
769 // so that we don't overflow fromoffsets[].
770 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
771 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
772
773 ulen = (int32_t)(unibufp - unibuf);
774 u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
775
776 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
777 // converting all of the input bytes.
778 // It works like this because ucnv_toUnicode() returns only under the
779 // following conditions:
780 // - an error occurred during conversion (an error code is set)
781 // - the target buffer is filled (the error code indicates an overflow)
782 // - the source is consumed
783 // That is, if the error code does not indicate a failure,
784 // not even an overflow, then the source must be consumed entirely.
785 fromSawEndOfBytes = (UBool)U_SUCCESS(err);
786
787 if (err == U_BUFFER_OVERFLOW_ERROR) {
788 err = U_ZERO_ERROR;
789 } else if (U_FAILURE(err)) {
790 char pos[32], errorBytes[32];
791 int8_t i, length, errorLength;
792
793 UErrorCode localError = U_ZERO_ERROR;
794 errorLength = (int8_t)sizeof(errorBytes);
795 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
796 if (U_FAILURE(localError) || errorLength == 0) {
797 errorLength = 1;
798 }
799
800 // print the input file offset of the start of the error bytes:
801 // input file offset of the current byte buffer +
802 // length of the just consumed bytes -
803 // length of the error bytes
804 length =
805 (int8_t)sprintf(pos, "%d",
806 (int)(infoffset + (cbufp - buf) - errorLength));
807
808 // output the bytes that caused the error
809 UnicodeString str;
810 for (i = 0; i < errorLength; ++i) {
811 if (i > 0) {
812 str.append((UChar)uSP);
813 }
814 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
815 str.append(nibbleToHex((uint8_t)errorBytes[i]));
816 }
817
818 initMsg(pname);
819 u_wmsg(stderr, "problemCvtToU",
820 UnicodeString(pos, length, "").getTerminatedBuffer(),
821 str.getTerminatedBuffer(),
822 u_wmsg_errorName(err));
823
824 willexit = TRUE;
825 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
826 }
827
828 // Replaced a check for whether the input was consumed by
829 // looping until it is; message key "premEndInput" now obsolete.
830
831 if (ulen == 0) {
832 continue;
833 }
834
835 // remove a U+FEFF Unicode signature character if requested
836 if (sig < 0) {
837 if (u.charAt(0) == uSig) {
838 u.remove(0, 1);
839
840 // account for the removed UChar and offset
841 --ulen;
842
843 if (useOffsets) {
844 // remove an offset from fromoffsets[] as well
845 // to keep the array parallel with the UChars
846 memmove(fromoffsets, fromoffsets + 1, ulen * 4);
847 }
848
849 }
850 sig = 0;
851 }
852
853 #if !UCONFIG_NO_TRANSLITERATION
854 // Transliterate/transform if needed.
855
856 // For transformation, we use chunking code -
857 // collect Unicode input until, for example, an end-of-line,
858 // then transform and output-convert that and continue collecting.
859 // This makes the transformation result independent of the buffer size
860 // while avoiding the slower keyboard mode.
861 // The end-of-chunk characters are completely included in the
862 // transformed string in case they are to be transformed themselves.
863 if (t != NULL) {
864 UnicodeString out;
865 int32_t chunkLimit;
866
867 do {
868 chunkLimit = getChunkLimit(chunk, u);
869 if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
870 // use all of the rest at the end of the text
871 chunkLimit = u.length();
872 }
873 if (chunkLimit >= 0) {
874 // complete the chunk and transform it
875 chunk.append(u, 0, chunkLimit);
876 u.remove(0, chunkLimit);
877 t->transliterate(chunk);
878
879 // append the transformation result to the result and empty the chunk
880 out.append(chunk);
881 chunk.remove();
882 } else {
883 // continue collecting the chunk
884 chunk.append(u);
885 break;
886 }
887 } while (!u.isEmpty());
888
889 u = out;
890 ulen = u.length();
891 }
892 #endif
893
894 // add a U+FEFF Unicode signature character if requested
895 // and possible/necessary
896 if (sig > 0) {
897 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
898 u.insert(0, (UChar)uSig);
899
900 if (useOffsets) {
901 // insert a pseudo-offset into fromoffsets[] as well
902 // to keep the array parallel with the UChars
903 memmove(fromoffsets + 1, fromoffsets, ulen * 4);
904 fromoffsets[0] = -1;
905 }
906
907 // account for the additional UChar and offset
908 ++ulen;
909 }
910 sig = 0;
911 }
912
913 // Convert the Unicode buffer into the destination codepage
914 // Again 'bufp' will be placed behind the last converted character
915 // And 'unibufp' will be placed behind the last converted unicode character
916 // At the last conversion flush should be set to true to ensure that
917 // all characters left get converted
918
919 unibuf = unibufbp = u.getBuffer();
920
921 do {
922 bufp = outbuf;
923
924 // Use fromSawEndOfBytes in addition to the flush flag -
925 // it indicates whether the intermediate Unicode string
926 // contains the very last UChars for the very last input bytes.
927 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
928 &unibufbp,
929 unibuf + ulen,
930 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
931
932 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
933 // converting all of the intermediate UChars.
934 // See comment for fromSawEndOfBytes.
935 toSawEndOfUnicode = (UBool)U_SUCCESS(err);
936
937 if (err == U_BUFFER_OVERFLOW_ERROR) {
938 err = U_ZERO_ERROR;
939 } else if (U_FAILURE(err)) {
940 UChar errorUChars[4];
941 const char *errtag;
942 char pos[32];
943 UChar32 c;
944 int8_t i, length, errorLength;
945
946 UErrorCode localError = U_ZERO_ERROR;
947 errorLength = (int8_t)UPRV_LENGTHOF(errorUChars);
948 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
949 if (U_FAILURE(localError) || errorLength == 0) {
950 // need at least 1 so that we don't access beyond the length of fromoffsets[]
951 errorLength = 1;
952 }
953
954 int32_t ferroffset;
955
956 if (useOffsets) {
957 // Unicode buffer offset of the start of the error UChars
958 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
959 if (ferroffset < 0) {
960 // approximation - the character started in the previous Unicode buffer
961 ferroffset = 0;
962 }
963
964 // get the corresponding byte offset out of fromoffsets[]
965 // go back if the offset is not known for some of the UChars
966 int32_t fromoffset;
967 do {
968 fromoffset = fromoffsets[ferroffset];
969 } while (fromoffset < 0 && --ferroffset >= 0);
970
971 // total input file offset =
972 // input file offset of the current byte buffer +
973 // byte buffer offset of where the current Unicode buffer is converted from +
974 // fromoffsets[Unicode offset]
975 ferroffset = infoffset + (prevbufp - buf) + fromoffset;
976 errtag = "problemCvtFromU";
977 } else {
978 // Do not use fromoffsets if (t != NULL) because the Unicode text may
979 // be different from what the offsets refer to.
980
981 // output file offset
982 ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
983 errtag = "problemCvtFromUOut";
984 }
985
986 length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
987
988 // output the code points that caused the error
989 UnicodeString str;
990 for (i = 0; i < errorLength;) {
991 if (i > 0) {
992 str.append((UChar)uSP);
993 }
994 U16_NEXT(errorUChars, i, errorLength, c);
995 if (c >= 0x100000) {
996 str.append(nibbleToHex((uint8_t)(c >> 20)));
997 }
998 if (c >= 0x10000) {
999 str.append(nibbleToHex((uint8_t)(c >> 16)));
1000 }
1001 str.append(nibbleToHex((uint8_t)(c >> 12)));
1002 str.append(nibbleToHex((uint8_t)(c >> 8)));
1003 str.append(nibbleToHex((uint8_t)(c >> 4)));
1004 str.append(nibbleToHex((uint8_t)c));
1005 }
1006
1007 initMsg(pname);
1008 u_wmsg(stderr, errtag,
1009 UnicodeString(pos, length, "").getTerminatedBuffer(),
1010 str.getTerminatedBuffer(),
1011 u_wmsg_errorName(err));
1012 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
1013
1014 willexit = TRUE;
1015 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
1016 }
1017
1018 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1019 // looping until they are; message key "premEnd" now obsolete.
1020
1021 // Finally, write the converted buffer to the output file
1022 size_t outlen = (size_t) (bufp - outbuf);
1023 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
1024 if (wr != outlen) {
1025 UnicodeString str(strerror(errno));
1026 initMsg(pname);
1027 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
1028 willexit = TRUE;
1029 }
1030
1031 if (willexit) {
1032 goto error_exit;
1033 }
1034 } while (!toSawEndOfUnicode);
1035 } while (!fromSawEndOfBytes);
1036 } while (!flush); // Stop when we have flushed the
1037 // converters (this means that it's
1038 // the end of output)
1039
1040 goto normal_exit;
1041
1042 error_exit:
1043 ret = FALSE;
1044
1045 normal_exit:
1046 // Cleanup.
1047
1048 ucnv_close(convfrom);
1049 ucnv_close(convto);
1050
1051 #if !UCONFIG_NO_TRANSLITERATION
1052 delete t;
1053 #endif
1054
1055 if (closeFile) {
1056 fclose(infile);
1057 }
1058
1059 return ret;
1060 }
1061
usage(const char * pname,int ecode)1062 static void usage(const char *pname, int ecode) {
1063 const UChar *msg;
1064 int32_t msgLen;
1065 UErrorCode err = U_ZERO_ERROR;
1066 FILE *fp = ecode ? stderr : stdout;
1067 int res;
1068
1069 initMsg(pname);
1070 msg =
1071 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord",
1072 &msgLen, &err);
1073 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1));
1074 UnicodeString mname(msg, msgLen + 1);
1075
1076 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer());
1077 if (!ecode) {
1078 if (!res) {
1079 fputc('\n', fp);
1080 }
1081 if (!u_wmsg(fp, "help")) {
1082 /* Now dump callbacks and finish. */
1083
1084 int i, count =
1085 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
1086 for (i = 0; i < count; ++i) {
1087 fprintf(fp, " %s", transcode_callbacks[i].name);
1088 }
1089 fputc('\n', fp);
1090 }
1091 }
1092
1093 exit(ecode);
1094 }
1095
1096 extern int
main(int argc,char ** argv)1097 main(int argc, char **argv)
1098 {
1099 FILE *outfile;
1100 int ret = 0;
1101
1102 size_t bufsz = DEFAULT_BUFSZ;
1103
1104 const char *fromcpage = 0;
1105 const char *tocpage = 0;
1106 const char *translit = 0;
1107 const char *outfilestr = 0;
1108 UBool fallback = FALSE;
1109
1110 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
1111 const void *fromuctxt = 0;
1112 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
1113 const void *touctxt = 0;
1114
1115 char **iter, **remainArgv, **remainArgvLimit;
1116 char **end = argv + argc;
1117
1118 const char *pname;
1119
1120 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
1121 const char *printName = 0;
1122
1123 UBool verbose = FALSE;
1124 UErrorCode status = U_ZERO_ERROR;
1125
1126 ConvertFile cf;
1127
1128 /* Initialize ICU */
1129 u_init(&status);
1130 if (U_FAILURE(status)) {
1131 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
1132 argv[0], u_errorName(status));
1133 exit(1);
1134 }
1135
1136 // Get and prettify pname.
1137 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
1138 #if U_PLATFORM_USES_ONLY_WIN32_API
1139 if (!pname) {
1140 pname = uprv_strrchr(*argv, '/');
1141 }
1142 #endif
1143 if (!pname) {
1144 pname = *argv;
1145 } else {
1146 ++pname;
1147 }
1148
1149 // First, get the arguments from command-line
1150 // to know the codepages to convert between
1151
1152 remainArgv = remainArgvLimit = argv + 1;
1153 for (iter = argv + 1; iter != end; iter++) {
1154 // Check for from charset
1155 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
1156 iter++;
1157 if (iter != end)
1158 fromcpage = *iter;
1159 else
1160 usage(pname, 1);
1161 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
1162 iter++;
1163 if (iter != end)
1164 tocpage = *iter;
1165 else
1166 usage(pname, 1);
1167 } else if (strcmp("-x", *iter) == 0) {
1168 iter++;
1169 if (iter != end)
1170 translit = *iter;
1171 else
1172 usage(pname, 1);
1173 } else if (!strcmp("--fallback", *iter)) {
1174 fallback = TRUE;
1175 } else if (!strcmp("--no-fallback", *iter)) {
1176 fallback = FALSE;
1177 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
1178 iter++;
1179 if (iter != end) {
1180 bufsz = atoi(*iter);
1181 if ((int) bufsz <= 0) {
1182 initMsg(pname);
1183 UnicodeString str(*iter);
1184 initMsg(pname);
1185 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
1186 return 3;
1187 }
1188 } else {
1189 usage(pname, 1);
1190 }
1191 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
1192 if (printTranslits) {
1193 usage(pname, 1);
1194 }
1195 printConvs = TRUE;
1196 } else if (strcmp("--default-code", *iter) == 0) {
1197 if (printTranslits) {
1198 usage(pname, 1);
1199 }
1200 printName = ucnv_getDefaultName();
1201 } else if (strcmp("--list-code", *iter) == 0) {
1202 if (printTranslits) {
1203 usage(pname, 1);
1204 }
1205
1206 iter++;
1207 if (iter != end) {
1208 UErrorCode e = U_ZERO_ERROR;
1209 printName = ucnv_getAlias(*iter, 0, &e);
1210 if (U_FAILURE(e) || !printName) {
1211 UnicodeString str(*iter);
1212 initMsg(pname);
1213 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
1214 return 2;
1215 }
1216 } else
1217 usage(pname, 1);
1218 } else if (strcmp("--canon", *iter) == 0) {
1219 printCanon = TRUE;
1220 } else if (strcmp("-L", *iter) == 0
1221 || !strcmp("--list-transliterators", *iter)) {
1222 if (printConvs) {
1223 usage(pname, 1);
1224 }
1225 printTranslits = TRUE;
1226 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
1227 || !strcmp("--help", *iter)) {
1228 usage(pname, 0);
1229 } else if (!strcmp("-c", *iter)) {
1230 fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
1231 } else if (!strcmp("--to-callback", *iter)) {
1232 iter++;
1233 if (iter != end) {
1234 const struct callback_ent *cbe = findCallback(*iter);
1235 if (cbe) {
1236 fromucallback = cbe->fromu;
1237 fromuctxt = cbe->fromuctxt;
1238 } else {
1239 UnicodeString str(*iter);
1240 initMsg(pname);
1241 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1242 return 4;
1243 }
1244 } else {
1245 usage(pname, 1);
1246 }
1247 } else if (!strcmp("--from-callback", *iter)) {
1248 iter++;
1249 if (iter != end) {
1250 const struct callback_ent *cbe = findCallback(*iter);
1251 if (cbe) {
1252 toucallback = cbe->tou;
1253 touctxt = cbe->touctxt;
1254 } else {
1255 UnicodeString str(*iter);
1256 initMsg(pname);
1257 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1258 return 4;
1259 }
1260 } else {
1261 usage(pname, 1);
1262 }
1263 } else if (!strcmp("-i", *iter)) {
1264 toucallback = UCNV_TO_U_CALLBACK_SKIP;
1265 } else if (!strcmp("--callback", *iter)) {
1266 iter++;
1267 if (iter != end) {
1268 const struct callback_ent *cbe = findCallback(*iter);
1269 if (cbe) {
1270 fromucallback = cbe->fromu;
1271 fromuctxt = cbe->fromuctxt;
1272 toucallback = cbe->tou;
1273 touctxt = cbe->touctxt;
1274 } else {
1275 UnicodeString str(*iter);
1276 initMsg(pname);
1277 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1278 return 4;
1279 }
1280 } else {
1281 usage(pname, 1);
1282 }
1283 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
1284 verbose = FALSE;
1285 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
1286 verbose = TRUE;
1287 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
1288 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname);
1289 return 0;
1290 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
1291 ++iter;
1292 if (iter != end && !outfilestr) {
1293 outfilestr = *iter;
1294 } else {
1295 usage(pname, 1);
1296 }
1297 } else if (0 == strcmp("--add-signature", *iter)) {
1298 cf.signature = 1;
1299 } else if (0 == strcmp("--remove-signature", *iter)) {
1300 cf.signature = -1;
1301 } else if (**iter == '-' && (*iter)[1]) {
1302 usage(pname, 1);
1303 } else {
1304 // move a non-option up in argv[]
1305 *remainArgvLimit++ = *iter;
1306 }
1307 }
1308
1309 if (printConvs || printName) {
1310 return printConverters(pname, printName, printCanon) ? 2 : 0;
1311 } else if (printTranslits) {
1312 return printTransliterators(printCanon) ? 3 : 0;
1313 }
1314
1315 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
1316 fromcpage = ucnv_getDefaultName();
1317 }
1318 if (!tocpage || !uprv_strcmp(tocpage, "-")) {
1319 tocpage = ucnv_getDefaultName();
1320 }
1321
1322 // Open the correct output file or connect to stdout for reading input
1323 if (outfilestr != 0 && strcmp(outfilestr, "-")) {
1324 outfile = fopen(outfilestr, "wb");
1325 if (outfile == 0) {
1326 UnicodeString str1(outfilestr, "");
1327 UnicodeString str2(strerror(errno), "");
1328 initMsg(pname);
1329 u_wmsg(stderr, "cantCreateOutputF",
1330 str1.getBuffer(), str2.getBuffer());
1331 return 1;
1332 }
1333 } else {
1334 outfilestr = "-";
1335 outfile = stdout;
1336 #ifdef USE_FILENO_BINARY_MODE
1337 if (setmode(fileno(outfile), O_BINARY) == -1) {
1338 u_wmsg(stderr, "cantSetOutBinMode");
1339 exit(-1);
1340 }
1341 #endif
1342 }
1343
1344 /* Loop again on the arguments to find all the input files, and
1345 convert them. */
1346
1347 cf.setBufferSize(bufsz);
1348
1349 if(remainArgv < remainArgvLimit) {
1350 for (iter = remainArgv; iter != remainArgvLimit; iter++) {
1351 if (!cf.convertFile(
1352 pname, fromcpage, toucallback, touctxt, tocpage,
1353 fromucallback, fromuctxt, fallback, translit, *iter,
1354 outfile, verbose)
1355 ) {
1356 goto error_exit;
1357 }
1358 }
1359 } else {
1360 if (!cf.convertFile(
1361 pname, fromcpage, toucallback, touctxt, tocpage,
1362 fromucallback, fromuctxt, fallback, translit, 0,
1363 outfile, verbose)
1364 ) {
1365 goto error_exit;
1366 }
1367 }
1368
1369 goto normal_exit;
1370 error_exit:
1371 #if !UCONFIG_NO_LEGACY_CONVERSION
1372 ret = 1;
1373 #else
1374 fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
1375 #endif
1376 normal_exit:
1377
1378 if (outfile != stdout) {
1379 fclose(outfile);
1380 }
1381
1382 u_cleanup();
1383
1384 return ret;
1385 }
1386
1387
1388 /*
1389 * Hey, Emacs, please set the following:
1390 *
1391 * Local Variables:
1392 * indent-tabs-mode: nil
1393 * End:
1394 *
1395 */
1396