1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2000-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uparse.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2000apr18
14 *   created by: Markus W. Scherer
15 *
16 *   This file provides a parser for files that are delimited by one single
17 *   character like ';' or TAB. Example: the Unicode Character Properties files
18 *   like UnicodeData.txt are semicolon-delimited.
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
25 #include "cstring.h"
26 #include "filestrm.h"
27 #include "uparse.h"
28 #include "ustr_imp.h"
29 
30 #include <stdio.h>
31 
32 U_CAPI const char * U_EXPORT2
u_skipWhitespace(const char * s)33 u_skipWhitespace(const char *s) {
34     while(U_IS_INV_WHITESPACE(*s)) {
35         ++s;
36     }
37     return s;
38 }
39 
40 U_CAPI char * U_EXPORT2
u_rtrim(char * s)41 u_rtrim(char *s) {
42     char *end=uprv_strchr(s, 0);
43     while(s<end && U_IS_INV_WHITESPACE(*(end-1))) {
44         *--end = 0;
45     }
46     return end;
47 }
48 
/*
 * Recognize the prefix "# @missing:" at the start of a line.
 * The four tokens '#', '@', "missing" and ':' may each be preceded by
 * any amount of white space (a poor man's regular expression).
 * Unicode 5.0 adds such lines to some data files to document
 * default property values.
 *
 * Returns a pointer to the first non-whitespace character after the ':'
 * if the prefix is present, otherwise the original pointer s.
 */
static const char *
getMissingLimit(const char *s) {
    const char *p;

    p=u_skipWhitespace(s);
    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(strncmp(p, "missing", 7)!=0) {
        return s;
    }

    /* 7 is the length of "missing" */
    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    return u_skipWhitespace(p+1);
}
71 
72 U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char * filename,char delimiter,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)73 u_parseDelimitedFile(const char *filename, char delimiter,
74                      char *fields[][2], int32_t fieldCount,
75                      UParseLineFn *lineFn, void *context,
76                      UErrorCode *pErrorCode) {
77     FileStream *file;
78     char line[300];
79     char *start, *limit;
80     int32_t i, length;
81 
82     if(U_FAILURE(*pErrorCode)) {
83         return;
84     }
85 
86     if(fields==NULL || lineFn==NULL || fieldCount<=0) {
87         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
88         return;
89     }
90 
91     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
92         filename=NULL;
93         file=T_FileStream_stdin();
94     } else {
95         file=T_FileStream_open(filename, "r");
96     }
97     if(file==NULL) {
98         *pErrorCode=U_FILE_ACCESS_ERROR;
99         return;
100     }
101 
102     while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
103         /* remove trailing newline characters */
104         length=(int32_t)(u_rtrim(line)-line);
105 
106         /*
107          * detect a line with # @missing:
108          * start parsing after that, or else from the beginning of the line
109          * set the default warning for @missing lines
110          */
111         start=(char *)getMissingLimit(line);
112         if(start==line) {
113             *pErrorCode=U_ZERO_ERROR;
114         } else {
115             *pErrorCode=U_USING_DEFAULT_WARNING;
116         }
117 
118         /* skip this line if it is empty or a comment */
119         if(*start==0 || *start=='#') {
120             continue;
121         }
122 
123         /* remove in-line comments */
124         limit=uprv_strchr(start, '#');
125         if(limit!=NULL) {
126             /* get white space before the pound sign */
127             while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
128                 --limit;
129             }
130 
131             /* truncate the line */
132             *limit=0;
133         }
134 
135         /* skip lines with only whitespace */
136         if(u_skipWhitespace(start)[0]==0) {
137             continue;
138         }
139 
140         /* for each field, call the corresponding field function */
141         for(i=0; i<fieldCount; ++i) {
142             /* set the limit pointer of this field */
143             limit=start;
144             while(*limit!=delimiter && *limit!=0) {
145                 ++limit;
146             }
147 
148             /* set the field start and limit in the fields array */
149             fields[i][0]=start;
150             fields[i][1]=limit;
151 
152             /* set start to the beginning of the next field, if any */
153             start=limit;
154             if(*start!=0) {
155                 ++start;
156             } else if(i+1<fieldCount) {
157                 *pErrorCode=U_PARSE_ERROR;
158                 limit=line+length;
159                 i=fieldCount;
160                 break;
161             }
162         }
163 
164         /* error in a field function? */
165         if(U_FAILURE(*pErrorCode)) {
166             break;
167         }
168 
169         /* call the field function */
170         lineFn(context, fields, fieldCount, pErrorCode);
171         if(U_FAILURE(*pErrorCode)) {
172             break;
173         }
174     }
175 
176     if(filename!=NULL) {
177         T_FileStream_close(file);
178     }
179 }
180 
181 /*
182  * parse a list of code points
183  * store them as a UTF-32 string in dest[destCapacity]
184  * return the number of code points
185  */
186 U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char * s,uint32_t * dest,int32_t destCapacity,UErrorCode * pErrorCode)187 u_parseCodePoints(const char *s,
188                   uint32_t *dest, int32_t destCapacity,
189                   UErrorCode *pErrorCode) {
190     char *end;
191     uint32_t value;
192     int32_t count;
193 
194     if(U_FAILURE(*pErrorCode)) {
195         return 0;
196     }
197     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
198         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
199         return 0;
200     }
201 
202     count=0;
203     for(;;) {
204         s=u_skipWhitespace(s);
205         if(*s==';' || *s==0) {
206             return count;
207         }
208 
209         /* read one code point */
210         value=(uint32_t)uprv_strtoul(s, &end, 16);
211         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
212             *pErrorCode=U_PARSE_ERROR;
213             return 0;
214         }
215 
216         /* append it to the destination array */
217         if(count<destCapacity) {
218             dest[count++]=value;
219         } else {
220             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
221         }
222 
223         /* go to the following characters */
224         s=end;
225     }
226 }
227 
228 /*
229  * parse a list of code points
230  * store them as a string in dest[destCapacity]
231  * set the first code point in *pFirst
232  * @return The length of the string in numbers of UChars.
233  */
234 U_CAPI int32_t U_EXPORT2
u_parseString(const char * s,UChar * dest,int32_t destCapacity,uint32_t * pFirst,UErrorCode * pErrorCode)235 u_parseString(const char *s,
236               UChar *dest, int32_t destCapacity,
237               uint32_t *pFirst,
238               UErrorCode *pErrorCode) {
239     char *end;
240     uint32_t value;
241     int32_t destLength;
242 
243     if(U_FAILURE(*pErrorCode)) {
244         return 0;
245     }
246     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
247         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
248         return 0;
249     }
250 
251     if(pFirst!=NULL) {
252         *pFirst=0xffffffff;
253     }
254 
255     destLength=0;
256     for(;;) {
257         s=u_skipWhitespace(s);
258         if(*s==';' || *s==0) {
259             if(destLength<destCapacity) {
260                 dest[destLength]=0;
261             } else if(destLength==destCapacity) {
262                 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
263             } else {
264                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
265             }
266             return destLength;
267         }
268 
269         /* read one code point */
270         value=(uint32_t)uprv_strtoul(s, &end, 16);
271         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
272             *pErrorCode=U_PARSE_ERROR;
273             return 0;
274         }
275 
276         /* store the first code point */
277         if(pFirst!=NULL) {
278             *pFirst=value;
279             pFirst=NULL;
280         }
281 
282         /* append it to the destination array */
283         if((destLength+U16_LENGTH(value))<=destCapacity) {
284             U16_APPEND_UNSAFE(dest, destLength, value);
285         } else {
286             destLength+=U16_LENGTH(value);
287         }
288 
289         /* go to the following characters */
290         s=end;
291     }
292 }
293 
294 /* read a range like start or start..end */
295 U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char * s,uint32_t * pStart,uint32_t * pEnd,const char ** terminator,UErrorCode * pErrorCode)296 u_parseCodePointRangeAnyTerminator(const char *s,
297                                    uint32_t *pStart, uint32_t *pEnd,
298                                    const char **terminator,
299                                    UErrorCode *pErrorCode) {
300     char *end;
301     uint32_t value;
302 
303     if(U_FAILURE(*pErrorCode)) {
304         return 0;
305     }
306     if(s==NULL || pStart==NULL || pEnd==NULL) {
307         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
308         return 0;
309     }
310 
311     /* read the start code point */
312     s=u_skipWhitespace(s);
313     value=(uint32_t)uprv_strtoul(s, &end, 16);
314     if(end<=s || value>=0x110000) {
315         *pErrorCode=U_PARSE_ERROR;
316         return 0;
317     }
318     *pStart=*pEnd=value;
319 
320     /* is there a "..end"? */
321     s=u_skipWhitespace(end);
322     if(*s!='.' || s[1]!='.') {
323         *terminator=end;
324         return 1;
325     }
326     s=u_skipWhitespace(s+2);
327 
328     /* read the end code point */
329     value=(uint32_t)uprv_strtoul(s, &end, 16);
330     if(end<=s || value>=0x110000) {
331         *pErrorCode=U_PARSE_ERROR;
332         return 0;
333     }
334     *pEnd=value;
335 
336     /* is this a valid range? */
337     if(value<*pStart) {
338         *pErrorCode=U_PARSE_ERROR;
339         return 0;
340     }
341 
342     *terminator=end;
343     return value-*pStart+1;
344 }
345 
346 U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char * s,uint32_t * pStart,uint32_t * pEnd,UErrorCode * pErrorCode)347 u_parseCodePointRange(const char *s,
348                       uint32_t *pStart, uint32_t *pEnd,
349                       UErrorCode *pErrorCode) {
350     const char *terminator;
351     int32_t rangeLength=
352         u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
353     if(U_SUCCESS(*pErrorCode)) {
354         terminator=u_skipWhitespace(terminator);
355         if(*terminator!=';' && *terminator!=0) {
356             *pErrorCode=U_PARSE_ERROR;
357             return 0;
358         }
359     }
360     return rangeLength;
361 }
362 
363 U_CAPI int32_t U_EXPORT2
u_parseUTF8(const char * source,int32_t sLen,char * dest,int32_t destCapacity,UErrorCode * status)364 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
365     const char *read = source;
366     int32_t i = 0;
367     unsigned int value = 0;
368     if(sLen == -1) {
369         sLen = (int32_t)strlen(source);
370     }
371 
372     while(read < source+sLen) {
373         sscanf(read, "%2x", &value);
374         if(i < destCapacity) {
375             dest[i] = (char)value;
376         }
377         i++;
378         read += 2;
379     }
380     return u_terminateChars(dest, destCapacity, i, status);
381 }
382