1 /******************************************************************************
2  *   Copyright (C) 2008-2012, International Business Machines
3  *   Corporation and others.  All Rights Reserved.
4  *******************************************************************************
5  */
6 #include "unicode/utypes.h"
7 
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include "unicode/utypes.h"
11 #include "unicode/putil.h"
12 #include "cmemory.h"
13 #include "cstring.h"
14 #include "filestrm.h"
15 #include "toolutil.h"
16 #include "unicode/uclean.h"
17 #include "unewdata.h"
18 #include "putilimp.h"
19 #include "pkg_gencmn.h"
20 
/* Capacity in bytes of the static stringStore arena that allocString() carves strings from. */
#define STRING_STORE_SIZE 200000

/* Defaults that createCommonDataFile() substitutes when name or type is passed as NULL. */
#define COMMON_DATA_NAME U_ICUDATA_NAME
#define DATA_TYPE "dat"
25 
26 /* ICU package data file format (.dat files) ------------------------------- ***
27 
28 Description of the data format after the usual ICU data file header
29 (UDataInfo etc.).
30 
31 Format version 1
32 
33 A .dat package file contains a simple Table of Contents of item names,
34 followed by the items themselves:
35 
36 1. ToC table
37 
38 uint32_t count; - number of items
39 UDataOffsetTOCEntry entry[count]; - pair of uint32_t values per item:
40     uint32_t nameOffset; - offset of the item name
41     uint32_t dataOffset; - offset of the item data
42 both are byte offsets from the beginning of the data
43 
44 2. item name strings
45 
46 All item names are stored as char * strings in one block between the ToC table
47 and the data items.
48 
49 3. data items
50 
51 The data items are stored following the item names block.
52 Each data item is 16-aligned.
53 The data items are stored in the sorted order of their names.
54 
Therefore, the end of the name strings block (padded so that the next item is
16-aligned) is the offset of the first item, the length of the last item is
the difference between its offset and the .dat file length, and the length of
every other item is the difference between its offset and the offset of the
item that follows it.
59 
60 ----------------------------------------------------------------------------- */
61 
/*
 * UDataInfo (cf. udata.h) describing the common data file; it is handed to
 * udata_create() when the .dat file is written.
 * NOTE(review): the meaning of the first six initializers follows the field
 * order of UDataInfo in udata.h, which is not visible here - confirm there.
 */
static const UDataInfo dataInfo={
    sizeof(UDataInfo),            /* size of this structure */
    0,                            /* presumably a reserved field - confirm in udata.h */

    U_IS_BIG_ENDIAN,              /* platform properties of the tool that builds the file */
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,                            /* presumably a reserved field - confirm in udata.h */

    {0x43, 0x6d, 0x6e, 0x44},     /* dataFormat="CmnD" */
    {1, 0, 0, 0},                 /* formatVersion */
    {3, 0, 0, 0}                  /* dataVersion */
};
76 
/* Copy of createCommonDataFile()'s max_size; when non-zero, addFile() skips files larger than this. */
static uint32_t maxSize;

/* Static arena for name strings: allocString() hands out consecutive pieces, stringTop is the next free offset. */
static char stringStore[STRING_STORE_SIZE];
/* basenameTotal is the sum of basename lengths (each including its NUL) accumulated by addFile() in data-file mode. */
static uint32_t stringTop=0, basenameTotal=0;

/* One item from the file list. */
typedef struct {
    /*
     * pathname: path used to open the item in data-file mode, or the entry
     *           point symbol derived from the basename in source-TOC mode.
     * basename: package name, tree separator and item name; this is the name
     *           written into the table of contents.
     */
    char *pathname, *basename;
    /* basenameLength includes the terminating NUL; the two offsets are positions inside the output data. */
    uint32_t basenameLength, basenameOffset, fileSize, fileOffset;
} File;

/* Number of File slots by which addFile() grows the files array. */
#define CHUNK_FILE_COUNT 256
static File *files = NULL;      /* growable array of listed items (never freed) */
static uint32_t fileCount=0;    /* number of slots in use */
static uint32_t fileMax = 0;    /* number of slots allocated */


/* Optional prefix put in front of the item symbols in the generated .c file; not assigned anywhere in the visible code. */
static char *symPrefix = NULL;

/* Size of the buffer that receives one line of the file list. */
#define LINE_BUFFER_SIZE 512
96 /* prototypes --------------------------------------------------------------- */
97 
/* append one listed item to the files array (data-file mode also checks and records its size) */
static void
addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose);

/* reserve length bytes in the static stringStore; exits the process when the store is full */
static char *
allocString(uint32_t length);

/* qsort() comparator ordering File entries by basename */
static int
compareFiles(const void *file1, const void *file2);

/* build a heap-allocated "<source><file separator><path>" string using the platform file separator */
static char *
pathToFullPath(const char *path, const char *source);

/* map non-tree separator (such as '\') to tree separator ('/') inplace. */
static void
fixDirToTreePath(char *s);
113 /* -------------------------------------------------------------------------- */
114 
115 U_CAPI void U_EXPORT2
createCommonDataFile(const char * destDir,const char * name,const char * entrypointName,const char * type,const char * source,const char * copyRight,const char * dataFile,uint32_t max_size,UBool sourceTOC,UBool verbose,char * gencmnFileName)116 createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight,
117                      const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName) {
118     static char buffer[4096];
119     char *line;
120     char *linePtr;
121     char *s = NULL;
122     UErrorCode errorCode=U_ZERO_ERROR;
123     uint32_t i, fileOffset, basenameOffset, length, nread;
124     FileStream *in, *file;
125 
126     line = (char *)uprv_malloc(sizeof(char) * LINE_BUFFER_SIZE);
127     if (line == NULL) {
128         fprintf(stderr, "gencmn: unable to allocate memory for line buffer of size %d\n", LINE_BUFFER_SIZE);
129         exit(U_MEMORY_ALLOCATION_ERROR);
130     }
131 
132     linePtr = line;
133 
134     maxSize = max_size;
135 
136     if (destDir == NULL) {
137         destDir = u_getDataDirectory();
138     }
139     if (name == NULL) {
140         name = COMMON_DATA_NAME;
141     }
142     if (type == NULL) {
143         type = DATA_TYPE;
144     }
145     if (source == NULL) {
146         source = ".";
147     }
148 
149     if (dataFile == NULL) {
150         in = T_FileStream_stdin();
151     } else {
152         in = T_FileStream_open(dataFile, "r");
153         if(in == NULL) {
154             fprintf(stderr, "gencmn: unable to open input file %s\n", dataFile);
155             exit(U_FILE_ACCESS_ERROR);
156         }
157     }
158 
159     if (verbose) {
160         if(sourceTOC) {
161             printf("generating %s_%s.c (table of contents source file)\n", name, type);
162         } else {
163             printf("generating %s.%s (common data file with table of contents)\n", name, type);
164         }
165     }
166 
167     /* read the list of files and get their lengths */
168     while((s != NULL && *s != 0) || (s=T_FileStream_readLine(in, (line=linePtr),
169                                                              LINE_BUFFER_SIZE))!=NULL) {
170         /* remove trailing newline characters and parse space separated items */
171         if (s != NULL && *s != 0) {
172             line=s;
173         } else {
174             s=line;
175         }
176         while(*s!=0) {
177             if(*s==' ') {
178                 *s=0;
179                 ++s;
180                 break;
181             } else if(*s=='\r' || *s=='\n') {
182                 *s=0;
183                 break;
184             }
185             ++s;
186         }
187 
188         /* check for comment */
189 
190         if (*line == '#') {
191             continue;
192         }
193 
194         /* add the file */
195 #if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
196         {
197           char *t;
198           while((t = uprv_strchr(line,U_FILE_ALT_SEP_CHAR))) {
199             *t = U_FILE_SEP_CHAR;
200           }
201         }
202 #endif
203         addFile(getLongPathname(line), name, source, sourceTOC, verbose);
204     }
205 
206     uprv_free(linePtr);
207 
208     if(in!=T_FileStream_stdin()) {
209         T_FileStream_close(in);
210     }
211 
212     if(fileCount==0) {
213         fprintf(stderr, "gencmn: no files listed in %s\n", dataFile == NULL ? "<stdin>" : dataFile);
214         return;
215     }
216 
217     /* sort the files by basename */
218     qsort(files, fileCount, sizeof(File), compareFiles);
219 
220     if(!sourceTOC) {
221         UNewDataMemory *out;
222 
223         /* determine the offsets of all basenames and files in this common one */
224         basenameOffset=4+8*fileCount;
225         fileOffset=(basenameOffset+(basenameTotal+15))&~0xf;
226         for(i=0; i<fileCount; ++i) {
227             files[i].fileOffset=fileOffset;
228             fileOffset+=(files[i].fileSize+15)&~0xf;
229             files[i].basenameOffset=basenameOffset;
230             basenameOffset+=files[i].basenameLength;
231         }
232 
233         /* create the output file */
234         out=udata_create(destDir, type, name,
235                          &dataInfo,
236                          copyRight == NULL ? U_COPYRIGHT_STRING : copyRight,
237                          &errorCode);
238         if(U_FAILURE(errorCode)) {
239             fprintf(stderr, "gencmn: udata_create(-d %s -n %s -t %s) failed - %s\n",
240                 destDir, name, type,
241                 u_errorName(errorCode));
242             exit(errorCode);
243         }
244 
245         /* write the table of contents */
246         udata_write32(out, fileCount);
247         for(i=0; i<fileCount; ++i) {
248             udata_write32(out, files[i].basenameOffset);
249             udata_write32(out, files[i].fileOffset);
250         }
251 
252         /* write the basenames */
253         for(i=0; i<fileCount; ++i) {
254             udata_writeString(out, files[i].basename, files[i].basenameLength);
255         }
256         length=4+8*fileCount+basenameTotal;
257 
258         /* copy the files */
259         for(i=0; i<fileCount; ++i) {
260             /* pad to 16-align the next file */
261             length&=0xf;
262             if(length!=0) {
263                 udata_writePadding(out, 16-length);
264             }
265 
266             if (verbose) {
267                 printf("adding %s (%ld byte%s)\n", files[i].pathname, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
268             }
269 
270             /* copy the next file */
271             file=T_FileStream_open(files[i].pathname, "rb");
272             if(file==NULL) {
273                 fprintf(stderr, "gencmn: unable to open listed file %s\n", files[i].pathname);
274                 exit(U_FILE_ACCESS_ERROR);
275             }
276             for(nread = 0;;) {
277                 length=T_FileStream_read(file, buffer, sizeof(buffer));
278                 if(length <= 0) {
279                     break;
280                 }
281                 nread += length;
282                 udata_writeBlock(out, buffer, length);
283             }
284             T_FileStream_close(file);
285             length=files[i].fileSize;
286 
287             if (nread != files[i].fileSize) {
288               fprintf(stderr, "gencmn: unable to read %s properly (got %ld/%ld byte%s)\n", files[i].pathname,  (long)nread, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
289                 exit(U_FILE_ACCESS_ERROR);
290             }
291         }
292 
293         /* pad to 16-align the last file (cleaner, avoids growing .dat files in icuswap) */
294         length&=0xf;
295         if(length!=0) {
296             udata_writePadding(out, 16-length);
297         }
298 
299         /* finish */
300         udata_finish(out, &errorCode);
301         if(U_FAILURE(errorCode)) {
302             fprintf(stderr, "gencmn: udata_finish() failed - %s\n", u_errorName(errorCode));
303             exit(errorCode);
304         }
305     } else {
306         /* write a .c source file with the table of contents */
307         char *filename;
308         FileStream *out;
309 
310         /* create the output filename */
311         filename=s=buffer;
312         uprv_strcpy(filename, destDir);
313         s=filename+uprv_strlen(filename);
314         if(s>filename && *(s-1)!=U_FILE_SEP_CHAR) {
315             *s++=U_FILE_SEP_CHAR;
316         }
317         uprv_strcpy(s, name);
318         if(*(type)!=0) {
319             s+=uprv_strlen(s);
320             *s++='_';
321             uprv_strcpy(s, type);
322         }
323         s+=uprv_strlen(s);
324         uprv_strcpy(s, ".c");
325 
326         /* open the output file */
327         out=T_FileStream_open(filename, "w");
328         if (gencmnFileName != NULL) {
329             uprv_strcpy(gencmnFileName, filename);
330         }
331         if(out==NULL) {
332             fprintf(stderr, "gencmn: unable to open .c output file %s\n", filename);
333             exit(U_FILE_ACCESS_ERROR);
334         }
335 
336         /* write the source file */
337         sprintf(buffer,
338             "/*\n"
339             " * ICU common data table of contents for %s.%s\n"
340             " * Automatically generated by icu/source/tools/gencmn/gencmn .\n"
341             " */\n\n"
342             "#include \"unicode/utypes.h\"\n"
343             "#include \"unicode/udata.h\"\n"
344             "\n"
345             "/* external symbol declarations for data (%d files) */\n",
346                 name, type, fileCount);
347         T_FileStream_writeLine(out, buffer);
348 
349         sprintf(buffer, "extern const char\n    %s%s[]", symPrefix?symPrefix:"", files[0].pathname);
350         T_FileStream_writeLine(out, buffer);
351         for(i=1; i<fileCount; ++i) {
352             sprintf(buffer, ",\n    %s%s[]", symPrefix?symPrefix:"", files[i].pathname);
353             T_FileStream_writeLine(out, buffer);
354         }
355         T_FileStream_writeLine(out, ";\n\n");
356 
357         sprintf(
358             buffer,
359             "U_EXPORT struct {\n"
360             "    uint16_t headerSize;\n"
361             "    uint8_t magic1, magic2;\n"
362             "    UDataInfo info;\n"
363             "    char padding[%lu];\n"
364             "    uint32_t count, reserved;\n"
365             "    struct {\n"
366             "        const char *name;\n"
367             "        const void *data;\n"
368             "    } toc[%lu];\n"
369             "} U_EXPORT2 %s_dat = {\n"
370             "    32, 0xda, 0x27, {\n"
371             "        %lu, 0,\n"
372             "        %u, %u, %u, 0,\n"
373             "        {0x54, 0x6f, 0x43, 0x50},\n"
374             "        {1, 0, 0, 0},\n"
375             "        {0, 0, 0, 0}\n"
376             "    },\n"
377             "    \"\", %lu, 0, {\n",
378             (unsigned long)32-4-sizeof(UDataInfo),
379             (unsigned long)fileCount,
380             entrypointName,
381             (unsigned long)sizeof(UDataInfo),
382             U_IS_BIG_ENDIAN,
383             U_CHARSET_FAMILY,
384             U_SIZEOF_UCHAR,
385             (unsigned long)fileCount
386         );
387         T_FileStream_writeLine(out, buffer);
388 
389         sprintf(buffer, "        { \"%s\", %s%s }", files[0].basename, symPrefix?symPrefix:"", files[0].pathname);
390         T_FileStream_writeLine(out, buffer);
391         for(i=1; i<fileCount; ++i) {
392             sprintf(buffer, ",\n        { \"%s\", %s%s }", files[i].basename, symPrefix?symPrefix:"", files[i].pathname);
393             T_FileStream_writeLine(out, buffer);
394         }
395 
396         T_FileStream_writeLine(out, "\n    }\n};\n");
397         T_FileStream_close(out);
398 
399         uprv_free(symPrefix);
400     }
401 }
402 
403 static void
addFile(const char * filename,const char * name,const char * source,UBool sourceTOC,UBool verbose)404 addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose) {
405     char *s;
406     uint32_t length;
407     char *fullPath = NULL;
408 
409     if(fileCount==fileMax) {
410       fileMax += CHUNK_FILE_COUNT;
411       files = uprv_realloc(files, fileMax*sizeof(files[0])); /* note: never freed. */
412       if(files==NULL) {
413         fprintf(stderr, "pkgdata/gencmn: Could not allocate %u bytes for %d files\n", (unsigned int)(fileMax*sizeof(files[0])), fileCount);
414         exit(U_MEMORY_ALLOCATION_ERROR);
415       }
416     }
417 
418     if(!sourceTOC) {
419         FileStream *file;
420 
421         if(uprv_pathIsAbsolute(filename)) {
422             fprintf(stderr, "gencmn: Error: absolute path encountered. Old style paths are not supported. Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, filename);
423             exit(U_ILLEGAL_ARGUMENT_ERROR);
424         }
425         fullPath = pathToFullPath(filename, source);
426         /* store the pathname */
427         length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
428         s=allocString(length);
429         uprv_strcpy(s, name);
430         uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
431         uprv_strcat(s, filename);
432 
433         /* get the basename */
434         fixDirToTreePath(s);
435         files[fileCount].basename=s;
436         files[fileCount].basenameLength=length;
437 
438         files[fileCount].pathname=fullPath;
439 
440         basenameTotal+=length;
441 
442         /* try to open the file */
443         file=T_FileStream_open(fullPath, "rb");
444         if(file==NULL) {
445             fprintf(stderr, "gencmn: unable to open listed file %s\n", fullPath);
446             exit(U_FILE_ACCESS_ERROR);
447         }
448 
449         /* get the file length */
450         length=T_FileStream_size(file);
451         if(T_FileStream_error(file) || length<=20) {
452             fprintf(stderr, "gencmn: unable to get length of listed file %s\n", fullPath);
453             exit(U_FILE_ACCESS_ERROR);
454         }
455 
456         T_FileStream_close(file);
457 
458         /* do not add files that are longer than maxSize */
459         if(maxSize && length>maxSize) {
460             if (verbose) {
461                 printf("%s ignored (size %ld > %ld)\n", fullPath, (long)length, (long)maxSize);
462             }
463             return;
464         }
465         files[fileCount].fileSize=length;
466     } else {
467         char *t;
468         /* get and store the basename */
469         /* need to include the package name */
470         length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
471         s=allocString(length);
472         uprv_strcpy(s, name);
473         uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
474         uprv_strcat(s, filename);
475         fixDirToTreePath(s);
476         files[fileCount].basename=s;
477         /* turn the basename into an entry point name and store in the pathname field */
478         t=files[fileCount].pathname=allocString(length);
479         while(--length>0) {
480             if(*s=='.' || *s=='-' || *s=='/') {
481                 *t='_';
482             } else {
483                 *t=*s;
484             }
485             ++s;
486             ++t;
487         }
488         *t=0;
489     }
490     ++fileCount;
491 }
492 
493 static char *
allocString(uint32_t length)494 allocString(uint32_t length) {
495     uint32_t top=stringTop+length;
496     char *p;
497 
498     if(top>STRING_STORE_SIZE) {
499         fprintf(stderr, "gencmn: out of memory\n");
500         exit(U_MEMORY_ALLOCATION_ERROR);
501     }
502     p=stringStore+stringTop;
503     stringTop=top;
504     return p;
505 }
506 
507 static char *
pathToFullPath(const char * path,const char * source)508 pathToFullPath(const char *path, const char *source) {
509     int32_t length;
510     int32_t newLength;
511     char *fullPath;
512     int32_t n;
513 
514     length = (uint32_t)(uprv_strlen(path) + 1);
515     newLength = (length + 1 + (int32_t)uprv_strlen(source));
516     fullPath = uprv_malloc(newLength);
517     if(source != NULL) {
518         uprv_strcpy(fullPath, source);
519         uprv_strcat(fullPath, U_FILE_SEP_STRING);
520     } else {
521         fullPath[0] = 0;
522     }
523     n = (int32_t)uprv_strlen(fullPath);
524     fullPath[n] = 0;       /* Suppress compiler warning for unused variable n    */
525                            /*  when conditional code below is not compiled.      */
526     uprv_strcat(fullPath, path);
527 
528 #if (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
529 #if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR)
530     /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
531     for(;fullPath[n];n++) {
532         if(fullPath[n] == U_FILE_ALT_SEP_CHAR) {
533             fullPath[n] = U_FILE_SEP_CHAR;
534         }
535     }
536 #endif
537 #endif
538 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
539     /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
540     for(;fullPath[n];n++) {
541         if(fullPath[n] == U_TREE_ENTRY_SEP_CHAR) {
542             fullPath[n] = U_FILE_SEP_CHAR;
543         }
544     }
545 #endif
546     return fullPath;
547 }
548 
549 static int
compareFiles(const void * file1,const void * file2)550 compareFiles(const void *file1, const void *file2) {
551     /* sort by basename */
552     return uprv_strcmp(((File *)file1)->basename, ((File *)file2)->basename);
553 }
554 
/*
 * Rewrites s in place so that every platform file separator, and every
 * alternate file separator, becomes the tree separator. Each replacement is
 * compiled in only when the respective separator differs from the tree
 * separator; if none does, the string is left untouched.
 */
static void
fixDirToTreePath(char *s)
{
#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) || ((U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR))
    char *cursor;

    /* one walk over the string handles both kinds of separators */
    for(cursor=s; *cursor!=0; ++cursor) {
#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
        if(*cursor==U_FILE_SEP_CHAR) {
            *cursor = U_TREE_ENTRY_SEP_CHAR;
            continue;
        }
#endif
#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
        if(*cursor==U_FILE_ALT_SEP_CHAR) {
            *cursor = U_TREE_ENTRY_SEP_CHAR;
        }
#endif
    }
#else
    (void)s;    /* all separators are identical: nothing to convert */
#endif
}
572