1 /*
2 **********************************************************************
3 *   Copyright (C) 2002-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *
7 * File genbrk.c
8 */
9 
10 //--------------------------------------------------------------------
11 //
12 //   Tool for generating RuleBasedBreakIterator data files (.brk files).
13 //   .brk files contain the precompiled rules for standard types
14 //   of iterators - word, line, sentence, etc.
15 //
16 //   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
17 //
18 //       options:   -v         verbose
19 //                  -? or -h   help
20 //
//   The input rule file is a plain text file containing break rules
//    in the input format accepted by RuleBasedBreakIterators.  The
//    file can be encoded as UTF-8, or UTF-16 (either endian), or
//    in the default code page (platform dependent).  UTF-encoded
//    files must include a BOM.
26 //
27 //--------------------------------------------------------------------
28 
29 #include "unicode/utypes.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/unistr.h"
32 #include "unicode/rbbi.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
36 
37 #include "uoptions.h"
38 #include "unewdata.h"
39 #include "ucmndata.h"
40 #include "rbbidata.h"
41 #include "cmemory.h"
42 
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46 
47 U_NAMESPACE_USE
48 
// Name the tool was invoked as.  main() sets it from argv[0] and
// usageAndDie() prints it in the usage line.
static char *progName;
// Command line option table handed to u_parseArgs() in main().
// main() refers to the entries by position (options[N]), so the order of the
// entries and the index noted beside each one must be kept in sync with main().
static UOption options[]={
    UOPTION_HELP_H,             /* 0  help: main() prints the usage text and exits */
    UOPTION_HELP_QUESTION_MARK, /* 1  help: treated the same as entry 0 */
    UOPTION_VERBOSE,            /* 2  verbose: accepted, but not read by main() in this file */
    { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3  rule source file name (required by main()) */
    { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 4  output file name (required by main()) */
    UOPTION_ICUDATADIR,         /* 5  value is passed to u_setDataDirectory() */
    UOPTION_DESTDIR,            /* 6  value becomes the output directory given to udata_create() */
    UOPTION_COPYRIGHT,          /* 7  when present, U_COPYRIGHT_STRING is passed to udata_create() */
    UOPTION_QUIET,              /* 8  when present, the final success message is not printed */
};
61 
usageAndDie(int retCode)62 void usageAndDie(int retCode) {
63         printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
64         printf("\tRead in break iteration rules text and write out the binary data\n"
65             "options:\n"
66             "\t-h or -? or --help  this usage text\n"
67             "\t-V or --version     show a version message\n"
68             "\t-c or --copyright   include a copyright notice\n"
69             "\t-v or --verbose     turn on verbose output\n"
70             "\t-q or --quiet       do not display warnings and progress\n"
71             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
72             "\t                    followed by path, defaults to %s\n"
73             "\t-d or --destdir     destination directory, followed by the path\n",
74             u_getDataDirectory());
75         exit (retCode);
76 }
77 
78 
79 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
80 
/*
 * Dummy UDataInfo, cf. udata.h.
 * Compiled only when break iteration or file I/O is configured out of the
 * library.  main() passes it to udata_create() for the placeholder output
 * file that it writes in that configuration.
 */
static UDataInfo dummyDataInfo = {
    sizeof(UDataInfo),              /* size */
    0,                              /* reserved */

    U_IS_BIG_ENDIAN,                /* build configuration values, same as in the real header */
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,                              /* reserved */

    { 0, 0, 0, 0 },                 /* dummy dataFormat */
    { 0, 0, 0, 0 },                 /* dummy formatVersion */
    { 0, 0, 0, 0 }                  /* dummy dataVersion */
};
95 
96 #else
97 
//
//  Set up the ICU data header, defined in ucmndata.h
//
//  main() overwrites dh.info.formatVersion with the version reported by the
//  compiled rules, then passes dh.info to udata_create() for the output file.
//
DataHeader dh ={
    {sizeof(DataHeader),           // Struct MappedData
        0xda,                      //     NOTE(review): presumably the two ICU data magic
        0x27},                     //      bytes - confirm against ucmndata.h

    {                               // struct UDataInfo
        sizeof(UDataInfo),          //     size
        0,                          //     reserved
        U_IS_BIG_ENDIAN,            //     build configuration values describing the
        U_CHARSET_FAMILY,           //      platform the data is generated on
        U_SIZEOF_UCHAR,
        0,                          //     reserved

    { 0x42, 0x72, 0x6b, 0x20 },     //     dataFormat="Brk "
    { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
                                    //      from the RBBI rule builder.  The  values declared
                                    //      here should never appear in any real RBBI data.
        { 4, 1, 0, 0 }              //   dataVersion (Unicode version)
    }};
120 
121 #endif
122 
123 //----------------------------------------------------------------------------
124 //
125 //  main      for genbrk
126 //
127 //----------------------------------------------------------------------------
main(int argc,char ** argv)128 int  main(int argc, char **argv) {
129     UErrorCode  status = U_ZERO_ERROR;
130     const char *ruleFileName;
131     const char *outFileName;
132     const char *outDir = NULL;
133     const char *copyright = NULL;
134 
135     //
136     // Pick up and check the command line arguments,
137     //    using the standard ICU tool utils option handling.
138     //
139     U_MAIN_INIT_ARGS(argc, argv);
140     progName = argv[0];
141     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
142     if(argc<0) {
143         // Unrecognized option
144         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
145         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
146     }
147 
148     if(options[0].doesOccur || options[1].doesOccur) {
149         //  -? or -h for help.
150         usageAndDie(0);
151     }
152 
153     if (!(options[3].doesOccur && options[4].doesOccur)) {
154         fprintf(stderr, "rule file and output file must both be specified.\n");
155         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
156     }
157     ruleFileName = options[3].value;
158     outFileName  = options[4].value;
159 
160     if (options[5].doesOccur) {
161         u_setDataDirectory(options[5].value);
162     }
163 
164     status = U_ZERO_ERROR;
165 
166     /* Combine the directory with the file name */
167     if(options[6].doesOccur) {
168         outDir = options[6].value;
169     }
170     if (options[7].doesOccur) {
171         copyright = U_COPYRIGHT_STRING;
172     }
173 
174 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
175 
176     UNewDataMemory *pData;
177     char msg[1024];
178 
179     /* write message with just the name */
180     sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
181     fprintf(stderr, "%s\n", msg);
182 
183     /* write the dummy data file */
184     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
185     udata_writeBlock(pData, msg, strlen(msg));
186     udata_finish(pData, &status);
187     return (int)status;
188 
189 #else
190     /* Initialize ICU */
191     u_init(&status);
192     if (U_FAILURE(status)) {
193         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
194             argv[0], u_errorName(status));
195         exit(1);
196     }
197     status = U_ZERO_ERROR;
198 
199     //
200     //  Read in the rule source file
201     //
202     long        result;
203     long        ruleFileSize;
204     FILE        *file;
205     char        *ruleBufferC;
206 
207     file = fopen(ruleFileName, "rb");
208     if( file == 0 ) {
209         fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
210         exit(-1);
211     }
212     fseek(file, 0, SEEK_END);
213     ruleFileSize = ftell(file);
214     fseek(file, 0, SEEK_SET);
215     ruleBufferC = new char[ruleFileSize+10];
216 
217     result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
218     if (result != ruleFileSize)  {
219         fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
220         exit (-1);
221     }
222     ruleBufferC[ruleFileSize]=0;
223     fclose(file);
224 
225     //
226     // Look for a Unicode Signature (BOM) on the rule file
227     //
228     int32_t        signatureLength;
229     const char *   ruleSourceC = ruleBufferC;
230     const char*    encoding = ucnv_detectUnicodeSignature(
231                            ruleSourceC, ruleFileSize, &signatureLength, &status);
232     if (U_FAILURE(status)) {
233         exit(status);
234     }
235     if(encoding!=NULL ){
236         ruleSourceC  += signatureLength;
237         ruleFileSize -= signatureLength;
238     }
239 
240     //
241     // Open a converter to take the rule file to UTF-16
242     //
243     UConverter* conv;
244     conv = ucnv_open(encoding, &status);
245     if (U_FAILURE(status)) {
246         fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
247         exit(status);
248     }
249 
250     //
251     // Convert the rules to UChar.
252     //  Preflight first to determine required buffer size.
253     //
254     uint32_t destCap = ucnv_toUChars(conv,
255                        NULL,           //  dest,
256                        0,              //  destCapacity,
257                        ruleSourceC,
258                        ruleFileSize,
259                        &status);
260     if (status != U_BUFFER_OVERFLOW_ERROR) {
261         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
262         exit(status);
263     };
264 
265     status = U_ZERO_ERROR;
266     UChar *ruleSourceU = new UChar[destCap+1];
267     ucnv_toUChars(conv,
268                   ruleSourceU,     //  dest,
269                   destCap+1,
270                   ruleSourceC,
271                   ruleFileSize,
272                   &status);
273     if (U_FAILURE(status)) {
274         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
275         exit(status);
276     };
277     ucnv_close(conv);
278 
279 
280     //
281     //  Put the source rules into a UnicodeString
282     //
283     UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
284 
285     //
286     //  Create the break iterator from the rules
287     //     This will compile the rules.
288     //
289     UParseError parseError;
290     parseError.line = 0;
291     parseError.offset = 0;
292     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
293     if (U_FAILURE(status)) {
294         fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
295                 u_errorName(status), (int)parseError.line, (int)parseError.offset);
296         exit(status);
297     };
298 
299 
300     //
301     //  Get the compiled rule data from the break iterator.
302     //
303     uint32_t        outDataSize;
304     const uint8_t  *outData;
305     outData = bi->getBinaryRules(outDataSize);
306 
307     // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
308     uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
309 
310     //
311     //  Create the output file
312     //
313     size_t bytesWritten;
314     UNewDataMemory *pData;
315     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
316     if(U_FAILURE(status)) {
317         fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
318                          outFileName, u_errorName(status));
319         exit(status);
320     }
321 
322 
323     //  Write the data itself.
324     udata_writeBlock(pData, outData, outDataSize);
325     // finish up
326     bytesWritten = udata_finish(pData, &status);
327     if(U_FAILURE(status)) {
328         fprintf(stderr, "genbrk: error %d writing the output file\n", status);
329         exit(status);
330     }
331 
332     if (bytesWritten != outDataSize) {
333         fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
334         exit(-1);
335     }
336 
337     delete bi;
338     delete[] ruleSourceU;
339     delete[] ruleBufferC;
340     u_cleanup();
341 
342 
343     if(!options[8].doesOccur) {
344         printf("genbrk: tool completed successfully.\n");
345     }
346     return 0;
347 
348 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
349 }
350 
351