1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1998-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File read.c
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   05/26/99    stephen     Creation.
17 *   5/10/01     Ram         removed ustdio dependency
18 *******************************************************************************
19 */
20 
21 #include "read.h"
22 #include "errmsg.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
25 
/* Code points of the characters that are significant to the tokenizer.
   They are written as numeric code point values and compared against the
   UChar32 values read from the input buffer. */
enum {
    OPENBRACE  = 0x007B,  /* '{' */
    CLOSEBRACE = 0x007D,  /* '}' */
    COMMA      = 0x002C,  /* ',' */
    QUOTE      = 0x0022,  /* '"' */
    ESCAPE     = 0x005C,  /* '\' */
    SLASH      = 0x002F,  /* '/' */
    ASTERISK   = 0x002A,  /* '*' */
    SPACE      = 0x0020,  /* ' ' */
    COLON      = 0x003A,  /* ':' */
    BADBOM     = 0xFFFE,  /* rejected by getNextToken(); presumably a byte
                             order mark read with the wrong byte order */
    CR         = 0x000D,  /* carriage return */
    LF         = 0x000A   /* line feed */
};
38 
39 static int32_t lineCount;
40 
41 /* Protos */
42 static enum ETokenType getStringToken(UCHARBUF *buf,
43                                       UChar32 initialChar,
44                                       struct UString *token,
45                                       UErrorCode *status);
46 
47 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
48 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
49 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
50 static UBool   isWhitespace          (UChar32 c);
51 static UBool   isNewline             (UChar32 c);
52 
resetLineNumber()53 U_CFUNC void resetLineNumber() {
54     lineCount = 1;
55 }
56 
57 /* Read and return the next token from the stream.  If the token is of
58    type eString, fill in the token parameter with the token.  If the
59    token is eError, then the status parameter will contain the
60    specific error.  This will be eItemNotFound at the end of file,
61    indicating that all tokens have been returned.  This method will
62    never return eString twice in a row; instead, multiple adjacent
63    string tokens will be merged into one, with no intervening
64    space. */
65 U_CFUNC enum ETokenType
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)66 getNextToken(UCHARBUF* buf,
67              struct UString *token,
68              uint32_t *linenumber, /* out: linenumber of token */
69              struct UString *comment,
70              UErrorCode *status) {
71     enum ETokenType result;
72     UChar32         c;
73 
74     if (U_FAILURE(*status)) {
75         return TOK_ERROR;
76     }
77 
78     /* Skip whitespace */
79     c = getNextChar(buf, TRUE, comment, status);
80 
81     if (U_FAILURE(*status)) {
82         return TOK_ERROR;
83     }
84 
85     *linenumber = lineCount;
86 
87     switch(c) {
88     case BADBOM:
89         return TOK_ERROR;
90     case OPENBRACE:
91         return TOK_OPEN_BRACE;
92     case CLOSEBRACE:
93         return TOK_CLOSE_BRACE;
94     case COMMA:
95         return TOK_COMMA;
96     case U_EOF:
97         return TOK_EOF;
98     case COLON:
99         return TOK_COLON;
100 
101     default:
102         result = getStringToken(buf, c, token, status);
103     }
104 
105     *linenumber = lineCount;
106     return result;
107 }
108 
109 /* Copy a string token into the given UnicodeString.  Upon entry, we
110    have already read the first character of the string token, which is
111    not a whitespace character (but may be a QUOTE or ESCAPE). This
112    function reads all subsequent characters that belong with this
113    string, and copy them into the token parameter. The other
114    important, and slightly convoluted purpose of this function is to
115    merge adjacent strings.  It looks forward a bit, and if the next
116    non comment, non whitespace item is a string, it reads it in as
117    well.  If two adjacent strings are quoted, they are merged without
118    intervening space.  Otherwise a single SPACE character is
119    inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)120 static enum ETokenType getStringToken(UCHARBUF* buf,
121                                       UChar32 initialChar,
122                                       struct UString *token,
123                                       UErrorCode *status) {
124     UBool    lastStringWasQuoted;
125     UChar32  c;
126     UChar    target[3] = { '\0' };
127     UChar    *pTarget   = target;
128     int      len=0;
129     UBool    isFollowingCharEscaped=FALSE;
130     UBool    isNLUnescaped = FALSE;
131     UChar32  prevC=0;
132 
133     /* We are guaranteed on entry that initialChar is not a whitespace
134        character. If we are at the EOF, or have some other problem, it
135        doesn't matter; we still want to validly return the initialChar
136        (if nothing else) as a string token. */
137 
138     if (U_FAILURE(*status)) {
139         return TOK_ERROR;
140     }
141 
142     /* setup */
143     lastStringWasQuoted = FALSE;
144     c = initialChar;
145     ustr_setlen(token, 0, status);
146 
147     if (U_FAILURE(*status)) {
148         return TOK_ERROR;
149     }
150 
151     for (;;) {
152         if (c == QUOTE) {
153             if (!lastStringWasQuoted && token->fLength > 0) {
154                 ustr_ucat(token, SPACE, status);
155 
156                 if (U_FAILURE(*status)) {
157                     return TOK_ERROR;
158                 }
159             }
160 
161             lastStringWasQuoted = TRUE;
162 
163             for (;;) {
164                 c = ucbuf_getc(buf,status);
165 
166                 /* EOF reached */
167                 if (c == U_EOF) {
168                     return TOK_EOF;
169                 }
170 
171                 /* Unterminated quoted strings */
172                 if (U_FAILURE(*status)) {
173                     return TOK_ERROR;
174                 }
175 
176                 if (c == QUOTE && !isFollowingCharEscaped) {
177                     break;
178                 }
179 
180                 if (c == ESCAPE  && !isFollowingCharEscaped) {
181                     pTarget = target;
182                     c       = unescape(buf, status);
183 
184                     if (c == U_ERR) {
185                         return TOK_ERROR;
186                     }
187                     if(c == CR || c == LF){
188                         isNLUnescaped = TRUE;
189                     }
190                 }
191 
192                 if(c==ESCAPE && !isFollowingCharEscaped){
193                     isFollowingCharEscaped = TRUE;
194                 }else{
195                     U_APPEND_CHAR32(c, pTarget,len);
196                     pTarget = target;
197                     ustr_uscat(token, pTarget,len, status);
198                     isFollowingCharEscaped = FALSE;
199                     len=0;
200                     if(c == CR || c == LF){
201                         if(isNLUnescaped == FALSE && prevC!=CR){
202                             lineCount++;
203                         }
204                         isNLUnescaped = FALSE;
205                     }
206                 }
207 
208                 if (U_FAILURE(*status)) {
209                     return TOK_ERROR;
210                 }
211                 prevC = c;
212             }
213         } else {
214             if (token->fLength > 0) {
215                 ustr_ucat(token, SPACE, status);
216 
217                 if (U_FAILURE(*status)) {
218                     return TOK_ERROR;
219                 }
220             }
221 
222             if(lastStringWasQuoted){
223                 if(getShowWarning()){
224                     warning(lineCount, "Mixing quoted and unquoted strings");
225                 }
226                 if(isStrict()){
227                     return TOK_ERROR;
228                 }
229 
230             }
231 
232             lastStringWasQuoted = FALSE;
233 
234             /* if we reach here we are mixing
235              * quoted and unquoted strings
236              * warn in normal mode and error in
237              * pedantic mode
238              */
239 
240             if (c == ESCAPE) {
241                 pTarget = target;
242                 c       = unescape(buf, status);
243 
244                 /* EOF reached */
245                 if (c == U_EOF) {
246                     return TOK_ERROR;
247                 }
248             }
249 
250             U_APPEND_CHAR32(c, pTarget,len);
251             pTarget = target;
252             ustr_uscat(token, pTarget,len, status);
253             len=0;
254 
255             if (U_FAILURE(*status)) {
256                 return TOK_ERROR;
257             }
258 
259             for (;;) {
260                 /* DON'T skip whitespace */
261                 c = getNextChar(buf, FALSE, NULL, status);
262 
263                 /* EOF reached */
264                 if (c == U_EOF) {
265                     ucbuf_ungetc(c, buf);
266                     return TOK_STRING;
267                 }
268 
269                 if (U_FAILURE(*status)) {
270                     return TOK_STRING;
271                 }
272 
273                 if (c == QUOTE
274                         || c == OPENBRACE
275                         || c == CLOSEBRACE
276                         || c == COMMA
277                         || c == COLON) {
278                     ucbuf_ungetc(c, buf);
279                     break;
280                 }
281 
282                 if (isWhitespace(c)) {
283                     break;
284                 }
285 
286                 if (c == ESCAPE) {
287                     pTarget = target;
288                     c       = unescape(buf, status);
289 
290                     if (c == U_ERR) {
291                         return TOK_ERROR;
292                     }
293                 }
294 
295                 U_APPEND_CHAR32(c, pTarget,len);
296                 pTarget = target;
297                 ustr_uscat(token, pTarget,len, status);
298                 len=0;
299                 if (U_FAILURE(*status)) {
300                     return TOK_ERROR;
301                 }
302             }
303         }
304 
305         /* DO skip whitespace */
306         c = getNextChar(buf, TRUE, NULL, status);
307 
308         if (U_FAILURE(*status)) {
309             return TOK_STRING;
310         }
311 
312         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
313             ucbuf_ungetc(c, buf);
314             return TOK_STRING;
315         }
316     }
317 }
318 
319 /* Retrieve the next character.  If skipwhite is
320    true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)321 static UChar32 getNextChar(UCHARBUF* buf,
322                            UBool skipwhite,
323                            struct UString *token,
324                            UErrorCode *status) {
325     UChar32 c, c2;
326 
327     if (U_FAILURE(*status)) {
328         return U_EOF;
329     }
330 
331     for (;;) {
332         c = ucbuf_getc(buf,status);
333 
334         if (c == U_EOF) {
335             return U_EOF;
336         }
337 
338         if (skipwhite && isWhitespace(c)) {
339             continue;
340         }
341 
342         /* This also handles the get() failing case */
343         if (c != SLASH) {
344             return c;
345         }
346 
347         c = ucbuf_getc(buf,status); /* "/c" */
348 
349         if (c == U_EOF) {
350             return U_EOF;
351         }
352 
353         switch (c) {
354         case SLASH:  /* "//" */
355             seekUntilNewline(buf, NULL, status);
356             break;
357 
358         case ASTERISK:  /* " / * " */
359             c2 = ucbuf_getc(buf, status); /* "/ * c" */
360             if(c2 == ASTERISK){  /* "/ * *" */
361                 /* parse multi-line comment and store it in token*/
362                 seekUntilEndOfComment(buf, token, status);
363             } else {
364                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
365                 seekUntilEndOfComment(buf, NULL, status);
366             }
367             break;
368 
369         default:
370             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
371             /* If get() failed this is a NOP */
372             return SLASH;
373         }
374 
375     }
376 }
377 
/* Consume characters from buf up to and including the next line break
 * (as recognized by isNewline(), which also advances the line counter for
 * LF and U+2029), or up to the end of the input.
 *
 *   token  - may be NULL; otherwise every character consumed, including
 *            the line break, is appended to it
 *   status - in/out ICU error code; nothing is read if it is already a
 *            failure, and reading stops as soon as it becomes one
 */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 c;

    if (U_FAILURE(*status)) {
        return;
    }

    do {
        c = ucbuf_getc(buf,status);
        /* add the char to token, but never the end-of-input sentinel */
        if(token!=NULL && c != U_EOF){
            ustr_u32cat(token, c, status);
        }
        /* Continue on any success code: a warning status must not stop
           the scan in the middle of the line. */
    } while (!isNewline(c) && c != U_EOF && U_SUCCESS(*status));
}
395 
/* Consume characters from buf up to and including the terminator
 * (ASTERISK followed by SLASH) of a block comment whose opening has
 * already been read.
 *
 *   token  - may be NULL; otherwise the comment body (everything before
 *            the terminator) is appended to it
 *   status - in/out ICU error code; nothing is read if it is already a
 *            failure.  If the input ends before the terminator is found,
 *            it is set to U_INVALID_FORMAT_ERROR and an error is reported
 *            for the line on which the scan started.
 *
 * Line breaks inside the comment advance the line counter. */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32  c, d;
    uint32_t line;

    if (U_FAILURE(*status)) {
        return;
    }

    /* remember where the scan started for the error message */
    line = (uint32_t)lineCount;

    do {
        c = ucbuf_getc(buf, status);

        if (c == ASTERISK) {
            d = ucbuf_getc(buf, status);

            if (d == SLASH) {
                /* terminator found */
                break;
            }
            /* lone asterisk: it is part of the body, re-read d next */
            ucbuf_ungetc(d, buf);
        }
        /* add the char to token, but never the end-of-input sentinel */
        if(token!=NULL && c != U_EOF){
            ustr_u32cat(token, c, status);
        }
        /* called only for its side effect: increment the lineCount */
        (void)isNewline(c);

        /* Continue on any success code: a warning status must not end
           the scan in the middle of the comment. */
    } while (c != U_EOF && U_SUCCESS(*status));

    if (c == U_EOF) {
        *status = U_INVALID_FORMAT_ERROR;
        error(line, "unterminated comment detected");
    }
}
434 
unescape(UCHARBUF * buf,UErrorCode * status)435 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
436     if (U_FAILURE(*status)) {
437         return U_EOF;
438     }
439 
440     /* We expect to be called after the ESCAPE has been seen, but
441      * u_fgetcx needs an ESCAPE to do its magic. */
442     ucbuf_ungetc(ESCAPE, buf);
443 
444     return ucbuf_getcx32(buf, status);
445 }
446 
isWhitespace(UChar32 c)447 static UBool isWhitespace(UChar32 c) {
448     switch (c) {
449         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
450     case 0x000A:
451     case 0x2029:
452         lineCount++;
453     case 0x000D:
454     case 0x0020:
455     case 0x0009:
456     case 0xFEFF:
457         return TRUE;
458 
459     default:
460         return FALSE;
461     }
462 }
463 
isNewline(UChar32 c)464 static UBool isNewline(UChar32 c) {
465     switch (c) {
466         /* '\n', '\r', 0x2029 */
467     case 0x000A:
468     case 0x2029:
469         lineCount++;
470     case 0x000D:
471         return TRUE;
472 
473     default:
474         return FALSE;
475     }
476 }
477