1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2009-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File gencfu.c
10 */
11 
12 //--------------------------------------------------------------------
13 //
14 //   Tool for generating Unicode Confusable data files (.cfu files).
//   .cfu files contain the compiled form of the confusable data
16 //   derived from the Unicode Consortium data described in
17 //   Unicode UAX 39.
18 //
19 //   Usage:  gencfu [options] -r confusables-file.txt -o output-file.cfu
20 //
21 //       options:   -v         verbose
22 //                  -? or -h   help
23 //
//   The input rule files are plain text files containing confusable character
//    definitions in the input format defined by Unicode UAX39 for the file
//    confusables.txt.  This source (.txt) format
//    is also accepted directly by ICU spoof detectors.  The
//    files must be encoded in utf-8 format, with or without a BOM.
29 //
30 //   The script used to compile confusablesWholeScript.txt into the CFU file
31 //    until the Unicode consortium deprecated it.
32 //
33 //--------------------------------------------------------------------
34 
35 #include "unicode/utypes.h"
36 #include "unicode/unistr.h"
37 #include "unicode/uclean.h"
38 #include "unicode/udata.h"
39 #include "unicode/putil.h"
40 
41 #include "uoptions.h"
42 #include "unewdata.h"
43 #include "ucmndata.h"
44 #include "uspoof_impl.h"
45 #include "cmemory.h"
46 
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 
51 U_NAMESPACE_USE
52 
53 static char *progName;
54 static UOption options[]={
55     UOPTION_HELP_H,             /* 0 */
56     UOPTION_HELP_QUESTION_MARK, /* 1 */
57     UOPTION_VERBOSE,            /* 2 */
58     { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
59     { "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0},  /* 4 */  // deprecated
60     { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 5 */
61     UOPTION_ICUDATADIR,         /* 6 */
62     UOPTION_DESTDIR,            /* 7 */
63     UOPTION_COPYRIGHT,          /* 8 */
64     UOPTION_QUIET,              /* 9 */
65 };
66 
usageAndDie(int retCode)67 void usageAndDie(int retCode) {
68         printf("Usage: %s [-v] [-options] -r confusablesRules.txt -o output-file\n", progName);
69         printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
70             "options:\n"
71             "\t-h or -? or --help  this usage text\n"
72             "\t-V or --version     show a version message\n"
73             "\t-c or --copyright   include a copyright notice\n"
74             "\t-v or --verbose     turn on verbose output\n"
75             "\t-q or --quiet       do not display warnings and progress\n"
76             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
77             "\t                    followed by path, defaults to %s\n"
78             "\t-d or --destdir     destination directory, followed by the path\n",
79             u_getDataDirectory());
80         exit (retCode);
81 }
82 
83 
84 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
85 
86 /* dummy UDataInfo cf. udata.h */
87 static UDataInfo dummyDataInfo = {
88     sizeof(UDataInfo),
89     0,
90 
91     U_IS_BIG_ENDIAN,
92     U_CHARSET_FAMILY,
93     U_SIZEOF_UCHAR,
94     0,
95 
96     { 0, 0, 0, 0 },                 /* dummy dataFormat */
97     { 0, 0, 0, 0 },                 /* dummy formatVersion */
98     { 0, 0, 0, 0 }                  /* dummy dataVersion */
99 };
100 
101 #else
102 
103 //
104 //  Set up the ICU data header, defined in ucmndata.h
105 //
106 DataHeader dh ={
107     {sizeof(DataHeader),           // Struct MappedData
108         0xda,
109         0x27},
110 
111     {                               // struct UDataInfo
112         sizeof(UDataInfo),          //     size
113         0,                          //     reserved
114         U_IS_BIG_ENDIAN,
115         U_CHARSET_FAMILY,
116         U_SIZEOF_UCHAR,
117         0,                          //     reserved
118 
119     { 0x43, 0x66, 0x75, 0x20 },     //     dataFormat="Cfu "
120     { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
121                                     //      from the  builder.  The  values declared
122                                     //      here should never appear in any real data.
123         { 5, 1, 0, 0 }              //   dataVersion (Unicode version)
124     }};
125 
126 #endif
127 
// Forward declaration of the helper, defined at the end of this file, that
// reads a whole source file into memory.  It returns NULL on failure; on
// success the byte count is stored through len and the caller releases the
// returned buffer with delete [].
static const char *readFile(const char *fileName, int32_t *len);
130 
131 //----------------------------------------------------------------------------
132 //
133 //  main      for gencfu
134 //
135 //----------------------------------------------------------------------------
main(int argc,char ** argv)136 int  main(int argc, char **argv) {
137     UErrorCode  status = U_ZERO_ERROR;
138     const char *confFileName;
139     const char *outFileName;
140     const char *outDir = NULL;
141     const char *copyright = NULL;
142 
143     //
144     // Pick up and check the command line arguments,
145     //    using the standard ICU tool utils option handling.
146     //
147     U_MAIN_INIT_ARGS(argc, argv);
148     progName = argv[0];
149     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
150     if(argc<0) {
151         // Unrecognized option
152         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
153         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
154     }
155 
156     if(options[0].doesOccur || options[1].doesOccur) {
157         //  -? or -h for help.
158         usageAndDie(0);
159     }
160 
161     if (!(options[3].doesOccur && options[5].doesOccur)) {
162         fprintf(stderr, "confusables file and output file must all be specified.\n");
163         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
164     }
165     confFileName   = options[3].value;
166     outFileName    = options[5].value;
167 
168     if (options[6].doesOccur) {
169         u_setDataDirectory(options[6].value);
170     }
171 
172     status = U_ZERO_ERROR;
173 
174     /* Combine the directory with the file name */
175     if(options[7].doesOccur) {
176         outDir = options[7].value;
177     }
178     if (options[8].doesOccur) {
179         copyright = U_COPYRIGHT_STRING;
180     }
181 
182     UBool quiet = FALSE;
183     if (options[9].doesOccur) {
184       quiet = TRUE;
185     }
186 
187 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
188     // spoof detection data file parsing is dependent on regular expressions.
189     // TODO: have the tool return an error status.  Requires fixing the ICU data build
190     //       so that it doesn't abort entirely on that error.
191 
192     UNewDataMemory *pData;
193     char msg[1024];
194 
195     /* write message with just the name */
196     sprintf(msg, "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
197     fprintf(stderr, "%s\n", msg);
198 
199     /* write the dummy data file */
200     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
201     udata_writeBlock(pData, msg, strlen(msg));
202     udata_finish(pData, &status);
203     return (int)status;
204 
205 #else
206     /* Initialize ICU */
207     u_init(&status);
208     if (U_FAILURE(status)) {
209         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
210             argv[0], u_errorName(status));
211         exit(1);
212     }
213     status = U_ZERO_ERROR;
214 
215     //  Read in the confusables source file
216 
217     int32_t      confusablesLen = 0;
218     const char  *confusables = readFile(confFileName, &confusablesLen);
219     if (confusables == NULL) {
220         printf("gencfu: error reading file  \"%s\"\n", confFileName);
221         exit(-1);
222     }
223 
224     //
225     //  Create the Spoof Detector from the source confusables files.
226     //     This will compile the data.
227     //
228     UParseError parseError;
229     parseError.line = 0;
230     parseError.offset = 0;
231     int32_t errType;
232     USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
233                                               NULL, 0,
234                                               &errType, &parseError, &status);
235     if (U_FAILURE(status)) {
236         fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\"  at file %s, line %d, column %d\n",
237                 u_errorName(status), confFileName, (int)parseError.line, (int)parseError.offset);
238         exit(status);
239     };
240 
241 
242     //
243     //  Get the compiled rule data from the USpoofChecker.
244     //
245     uint32_t        outDataSize;
246     uint8_t        *outData;
247     outDataSize = uspoof_serialize(sc, NULL, 0, &status);
248     if (status != U_BUFFER_OVERFLOW_ERROR) {
249         fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
250         exit(status);
251     }
252     status = U_ZERO_ERROR;
253     outData = new uint8_t[outDataSize];
254     uspoof_serialize(sc, outData, outDataSize, &status);
255 
256     // Copy the data format version numbers from the spoof data header into the UDataMemory header.
257 
258     uprv_memcpy(dh.info.formatVersion,
259                 reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion,
260                 sizeof(dh.info.formatVersion));
261 
262     //
263     //  Create the output file
264     //
265     size_t bytesWritten;
266     UNewDataMemory *pData;
267     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
268     if(U_FAILURE(status)) {
269         fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n",
270                          outFileName, u_errorName(status));
271         exit(status);
272     }
273 
274 
275     //  Write the data itself.
276     udata_writeBlock(pData, outData, outDataSize);
277     // finish up
278     bytesWritten = udata_finish(pData, &status);
279     if(U_FAILURE(status)) {
280         fprintf(stderr, "gencfu: Error %d writing the output file\n", status);
281         exit(status);
282     }
283 
284     if (bytesWritten != outDataSize) {
285         fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName);
286         exit(-1);
287     }
288 
289     uspoof_close(sc);
290     delete [] outData;
291     delete [] confusables;
292     u_cleanup();
293     if (!quiet) {
294         printf("gencfu: tool completed successfully.\n");
295     }
296     return 0;
297 #endif   // UCONFIG_NO_REGULAR_EXPRESSIONS
298 }
299 
300 
//
//  Read in a confusables source file
//
//    fileName   path of the file to read; it is opened in binary mode.
//    len        receives the number of bytes read.  Written only on success.
//
//    Returns a buffer allocated with new[] that holds the file contents
//    followed by a terminating NUL byte, or NULL if the file cannot be
//    opened, sized or read completely, or is too large for an int32_t length.
//    The caller releases the buffer with delete [].
//
static const char *readFile(const char *fileName, int32_t *len) {
    FILE *file = fopen(fileName, "rb");
    if (file == NULL) {
        return NULL;
    }

    // Determine the size by seeking to the end.  Each step can fail (for
    // example on a stream that is not seekable); a failed ftell() yields -1,
    // which must never reach the allocation or the fread() below.
    long fileSize = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        fileSize = ftell(file);
    }
    // The upper bound keeps both fileSize+10 and the int32_t result in range.
    if (fileSize < 0 || fileSize > INT32_MAX - 10 || fseek(file, 0, SEEK_SET) != 0) {
        fclose(file);
        return NULL;
    }

    // A few spare bytes beyond the content leave room for the terminating NUL.
    // Plain new[] signals failure by throwing, so there is no NULL result to test.
    char *result = new char[fileSize + 10];

    size_t wanted = static_cast<size_t>(fileSize);
    size_t bytesRead = fread(result, 1, wanted, file);
    fclose(file);
    if (bytesRead != wanted) {
        delete [] result;
        return NULL;
    }

    result[fileSize] = 0;
    *len = static_cast<int32_t>(fileSize);
    return result;
}
333