1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2000-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  uparse.c
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2000apr18
16 *   created by: Markus W. Scherer
17 *
18 *   This file provides a parser for files that are delimited by one single
19 *   character like ';' or TAB. Example: the Unicode Character Properties files
20 *   like UnicodeData.txt are semicolon-delimited.
21 */
22 
23 #include "unicode/utypes.h"
24 #include "unicode/uchar.h"
25 #include "unicode/ustring.h"
26 #include "unicode/utf16.h"
27 #include "cstring.h"
28 #include "filestrm.h"
29 #include "uparse.h"
30 #include "ustr_imp.h"
31 
32 #include <stdio.h>
33 
34 U_CAPI const char * U_EXPORT2
u_skipWhitespace(const char * s)35 u_skipWhitespace(const char *s) {
36     while(U_IS_INV_WHITESPACE(*s)) {
37         ++s;
38     }
39     return s;
40 }
41 
42 U_CAPI char * U_EXPORT2
u_rtrim(char * s)43 u_rtrim(char *s) {
44     char *end=uprv_strchr(s, 0);
45     while(s<end && U_IS_INV_WHITESPACE(*(end-1))) {
46         *--end = 0;
47     }
48     return end;
49 }
50 
/*
 * Recognize a "# @missing:" prefix, allowing any amount of white space
 * before and between the tokens '#', '@', "missing" and ':'.
 *
 * If s starts with such a prefix, return a pointer to the first
 * non-whitespace character after the colon.
 * Otherwise return s unchanged.
 *
 * Unicode 5.0 adds such lines in some data files to document
 * default property values.
 */
static const char *
getMissingLimit(const char *s) {
    const char *p=u_skipWhitespace(s);

    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(0!=strncmp(p, "missing", 7)) {
        return s;
    }

    /* 7 is the length of "missing" */
    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    return u_skipWhitespace(p+1);
}
73 
74 U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char * filename,char delimiter,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)75 u_parseDelimitedFile(const char *filename, char delimiter,
76                      char *fields[][2], int32_t fieldCount,
77                      UParseLineFn *lineFn, void *context,
78                      UErrorCode *pErrorCode) {
79     FileStream *file;
80     char line[10000];
81     char *start, *limit;
82     int32_t i, length;
83 
84     if(U_FAILURE(*pErrorCode)) {
85         return;
86     }
87 
88     if(fields==NULL || lineFn==NULL || fieldCount<=0) {
89         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
90         return;
91     }
92 
93     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
94         filename=NULL;
95         file=T_FileStream_stdin();
96     } else {
97         file=T_FileStream_open(filename, "r");
98     }
99     if(file==NULL) {
100         *pErrorCode=U_FILE_ACCESS_ERROR;
101         return;
102     }
103 
104     while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
105         /* remove trailing newline characters */
106         length=(int32_t)(u_rtrim(line)-line);
107 
108         /*
109          * detect a line with # @missing:
110          * start parsing after that, or else from the beginning of the line
111          * set the default warning for @missing lines
112          */
113         start=(char *)getMissingLimit(line);
114         if(start==line) {
115             *pErrorCode=U_ZERO_ERROR;
116         } else {
117             *pErrorCode=U_USING_DEFAULT_WARNING;
118         }
119 
120         /* skip this line if it is empty or a comment */
121         if(*start==0 || *start=='#') {
122             continue;
123         }
124 
125         /* remove in-line comments */
126         limit=uprv_strchr(start, '#');
127         if(limit!=NULL) {
128             /* get white space before the pound sign */
129             while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
130                 --limit;
131             }
132 
133             /* truncate the line */
134             *limit=0;
135         }
136 
137         /* skip lines with only whitespace */
138         if(u_skipWhitespace(start)[0]==0) {
139             continue;
140         }
141 
142         /* for each field, call the corresponding field function */
143         for(i=0; i<fieldCount; ++i) {
144             /* set the limit pointer of this field */
145             limit=start;
146             while(*limit!=delimiter && *limit!=0) {
147                 ++limit;
148             }
149 
150             /* set the field start and limit in the fields array */
151             fields[i][0]=start;
152             fields[i][1]=limit;
153 
154             /* set start to the beginning of the next field, if any */
155             start=limit;
156             if(*start!=0) {
157                 ++start;
158             } else if(i+1<fieldCount) {
159                 *pErrorCode=U_PARSE_ERROR;
160                 limit=line+length;
161                 i=fieldCount;
162                 break;
163             }
164         }
165 
166         /* too few fields? */
167         if(U_FAILURE(*pErrorCode)) {
168             break;
169         }
170 
171         /* call the field function */
172         lineFn(context, fields, fieldCount, pErrorCode);
173         if(U_FAILURE(*pErrorCode)) {
174             break;
175         }
176     }
177 
178     if(filename!=NULL) {
179         T_FileStream_close(file);
180     }
181 }
182 
183 /*
184  * parse a list of code points
185  * store them as a UTF-32 string in dest[destCapacity]
186  * return the number of code points
187  */
188 U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char * s,uint32_t * dest,int32_t destCapacity,UErrorCode * pErrorCode)189 u_parseCodePoints(const char *s,
190                   uint32_t *dest, int32_t destCapacity,
191                   UErrorCode *pErrorCode) {
192     char *end;
193     uint32_t value;
194     int32_t count;
195 
196     if(U_FAILURE(*pErrorCode)) {
197         return 0;
198     }
199     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
200         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
201         return 0;
202     }
203 
204     count=0;
205     for(;;) {
206         s=u_skipWhitespace(s);
207         if(*s==';' || *s==0) {
208             return count;
209         }
210 
211         /* read one code point */
212         value=(uint32_t)uprv_strtoul(s, &end, 16);
213         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
214             *pErrorCode=U_PARSE_ERROR;
215             return 0;
216         }
217 
218         /* append it to the destination array */
219         if(count<destCapacity) {
220             dest[count++]=value;
221         } else {
222             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
223         }
224 
225         /* go to the following characters */
226         s=end;
227     }
228 }
229 
230 /*
231  * parse a list of code points
232  * store them as a string in dest[destCapacity]
233  * set the first code point in *pFirst
234  * @return The length of the string in numbers of UChars.
235  */
236 U_CAPI int32_t U_EXPORT2
u_parseString(const char * s,UChar * dest,int32_t destCapacity,uint32_t * pFirst,UErrorCode * pErrorCode)237 u_parseString(const char *s,
238               UChar *dest, int32_t destCapacity,
239               uint32_t *pFirst,
240               UErrorCode *pErrorCode) {
241     char *end;
242     uint32_t value;
243     int32_t destLength;
244 
245     if(U_FAILURE(*pErrorCode)) {
246         return 0;
247     }
248     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
249         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
250         return 0;
251     }
252 
253     if(pFirst!=NULL) {
254         *pFirst=0xffffffff;
255     }
256 
257     destLength=0;
258     for(;;) {
259         s=u_skipWhitespace(s);
260         if(*s==';' || *s==0) {
261             if(destLength<destCapacity) {
262                 dest[destLength]=0;
263             } else if(destLength==destCapacity) {
264                 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
265             } else {
266                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
267             }
268             return destLength;
269         }
270 
271         /* read one code point */
272         value=(uint32_t)uprv_strtoul(s, &end, 16);
273         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
274             *pErrorCode=U_PARSE_ERROR;
275             return 0;
276         }
277 
278         /* store the first code point */
279         if(pFirst!=NULL) {
280             *pFirst=value;
281             pFirst=NULL;
282         }
283 
284         /* append it to the destination array */
285         if((destLength+U16_LENGTH(value))<=destCapacity) {
286             U16_APPEND_UNSAFE(dest, destLength, value);
287         } else {
288             destLength+=U16_LENGTH(value);
289         }
290 
291         /* go to the following characters */
292         s=end;
293     }
294 }
295 
296 /* read a range like start or start..end */
297 U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char * s,uint32_t * pStart,uint32_t * pEnd,const char ** terminator,UErrorCode * pErrorCode)298 u_parseCodePointRangeAnyTerminator(const char *s,
299                                    uint32_t *pStart, uint32_t *pEnd,
300                                    const char **terminator,
301                                    UErrorCode *pErrorCode) {
302     char *end;
303     uint32_t value;
304 
305     if(U_FAILURE(*pErrorCode)) {
306         return 0;
307     }
308     if(s==NULL || pStart==NULL || pEnd==NULL) {
309         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
310         return 0;
311     }
312 
313     /* read the start code point */
314     s=u_skipWhitespace(s);
315     value=(uint32_t)uprv_strtoul(s, &end, 16);
316     if(end<=s || value>=0x110000) {
317         *pErrorCode=U_PARSE_ERROR;
318         return 0;
319     }
320     *pStart=*pEnd=value;
321 
322     /* is there a "..end"? */
323     s=u_skipWhitespace(end);
324     if(*s!='.' || s[1]!='.') {
325         *terminator=end;
326         return 1;
327     }
328     s=u_skipWhitespace(s+2);
329 
330     /* read the end code point */
331     value=(uint32_t)uprv_strtoul(s, &end, 16);
332     if(end<=s || value>=0x110000) {
333         *pErrorCode=U_PARSE_ERROR;
334         return 0;
335     }
336     *pEnd=value;
337 
338     /* is this a valid range? */
339     if(value<*pStart) {
340         *pErrorCode=U_PARSE_ERROR;
341         return 0;
342     }
343 
344     *terminator=end;
345     return value-*pStart+1;
346 }
347 
348 U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char * s,uint32_t * pStart,uint32_t * pEnd,UErrorCode * pErrorCode)349 u_parseCodePointRange(const char *s,
350                       uint32_t *pStart, uint32_t *pEnd,
351                       UErrorCode *pErrorCode) {
352     const char *terminator;
353     int32_t rangeLength=
354         u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
355     if(U_SUCCESS(*pErrorCode)) {
356         terminator=u_skipWhitespace(terminator);
357         if(*terminator!=';' && *terminator!=0) {
358             *pErrorCode=U_PARSE_ERROR;
359             return 0;
360         }
361     }
362     return rangeLength;
363 }
364 
365 U_CAPI int32_t U_EXPORT2
u_parseUTF8(const char * source,int32_t sLen,char * dest,int32_t destCapacity,UErrorCode * status)366 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
367     const char *read = source;
368     int32_t i = 0;
369     unsigned int value = 0;
370     if(sLen == -1) {
371         sLen = (int32_t)strlen(source);
372     }
373 
374     while(read < source+sLen) {
375         sscanf(read, "%2x", &value);
376         if(i < destCapacity) {
377             dest[i] = (char)value;
378         }
379         i++;
380         read += 2;
381     }
382     return u_terminateChars(dest, destCapacity, i, status);
383 }
384