1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1998-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *
9 * File read.c
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   05/26/99    stephen     Creation.
15 *   5/10/01     Ram         removed ustdio dependency
16 *******************************************************************************
17 */
18 
19 #include "read.h"
20 #include "errmsg.h"
21 #include "unicode/ustring.h"
22 #include "unicode/utf16.h"
23 
24 #define OPENBRACE    0x007B
25 #define CLOSEBRACE   0x007D
26 #define COMMA        0x002C
27 #define QUOTE        0x0022
28 #define ESCAPE       0x005C
29 #define SLASH        0x002F
30 #define ASTERISK     0x002A
31 #define SPACE        0x0020
32 #define COLON        0x003A
33 #define BADBOM       0xFFFE
34 #define CR           0x000D
35 #define LF           0x000A
36 
37 static int32_t lineCount;
38 
39 /* Protos */
40 static enum ETokenType getStringToken(UCHARBUF *buf,
41                                       UChar32 initialChar,
42                                       struct UString *token,
43                                       UErrorCode *status);
44 
45 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
46 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
47 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
48 static UBool   isWhitespace          (UChar32 c);
49 static UBool   isNewline             (UChar32 c);
50 
resetLineNumber()51 U_CFUNC void resetLineNumber() {
52     lineCount = 1;
53 }
54 
55 /* Read and return the next token from the stream.  If the token is of
56    type eString, fill in the token parameter with the token.  If the
57    token is eError, then the status parameter will contain the
58    specific error.  This will be eItemNotFound at the end of file,
59    indicating that all tokens have been returned.  This method will
60    never return eString twice in a row; instead, multiple adjacent
61    string tokens will be merged into one, with no intervening
62    space. */
63 U_CFUNC enum ETokenType
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)64 getNextToken(UCHARBUF* buf,
65              struct UString *token,
66              uint32_t *linenumber, /* out: linenumber of token */
67              struct UString *comment,
68              UErrorCode *status) {
69     enum ETokenType result;
70     UChar32         c;
71 
72     if (U_FAILURE(*status)) {
73         return TOK_ERROR;
74     }
75 
76     /* Skip whitespace */
77     c = getNextChar(buf, TRUE, comment, status);
78 
79     if (U_FAILURE(*status)) {
80         return TOK_ERROR;
81     }
82 
83     *linenumber = lineCount;
84 
85     switch(c) {
86     case BADBOM:
87         return TOK_ERROR;
88     case OPENBRACE:
89         return TOK_OPEN_BRACE;
90     case CLOSEBRACE:
91         return TOK_CLOSE_BRACE;
92     case COMMA:
93         return TOK_COMMA;
94     case U_EOF:
95         return TOK_EOF;
96     case COLON:
97         return TOK_COLON;
98 
99     default:
100         result = getStringToken(buf, c, token, status);
101     }
102 
103     *linenumber = lineCount;
104     return result;
105 }
106 
107 /* Copy a string token into the given UnicodeString.  Upon entry, we
108    have already read the first character of the string token, which is
109    not a whitespace character (but may be a QUOTE or ESCAPE). This
110    function reads all subsequent characters that belong with this
111    string, and copy them into the token parameter. The other
112    important, and slightly convoluted purpose of this function is to
113    merge adjacent strings.  It looks forward a bit, and if the next
114    non comment, non whitespace item is a string, it reads it in as
115    well.  If two adjacent strings are quoted, they are merged without
116    intervening space.  Otherwise a single SPACE character is
117    inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)118 static enum ETokenType getStringToken(UCHARBUF* buf,
119                                       UChar32 initialChar,
120                                       struct UString *token,
121                                       UErrorCode *status) {
122     UBool    lastStringWasQuoted;
123     UChar32  c;
124     UChar    target[3] = { '\0' };
125     UChar    *pTarget   = target;
126     int      len=0;
127     UBool    isFollowingCharEscaped=FALSE;
128     UBool    isNLUnescaped = FALSE;
129     UChar32  prevC=0;
130 
131     /* We are guaranteed on entry that initialChar is not a whitespace
132        character. If we are at the EOF, or have some other problem, it
133        doesn't matter; we still want to validly return the initialChar
134        (if nothing else) as a string token. */
135 
136     if (U_FAILURE(*status)) {
137         return TOK_ERROR;
138     }
139 
140     /* setup */
141     lastStringWasQuoted = FALSE;
142     c = initialChar;
143     ustr_setlen(token, 0, status);
144 
145     if (U_FAILURE(*status)) {
146         return TOK_ERROR;
147     }
148 
149     for (;;) {
150         if (c == QUOTE) {
151             if (!lastStringWasQuoted && token->fLength > 0) {
152                 ustr_ucat(token, SPACE, status);
153 
154                 if (U_FAILURE(*status)) {
155                     return TOK_ERROR;
156                 }
157             }
158 
159             lastStringWasQuoted = TRUE;
160 
161             for (;;) {
162                 c = ucbuf_getc(buf,status);
163 
164                 /* EOF reached */
165                 if (c == U_EOF) {
166                     return TOK_EOF;
167                 }
168 
169                 /* Unterminated quoted strings */
170                 if (U_FAILURE(*status)) {
171                     return TOK_ERROR;
172                 }
173 
174                 if (c == QUOTE && !isFollowingCharEscaped) {
175                     break;
176                 }
177 
178                 if (c == ESCAPE  && !isFollowingCharEscaped) {
179                     pTarget = target;
180                     c       = unescape(buf, status);
181 
182                     if (c == U_ERR) {
183                         return TOK_ERROR;
184                     }
185                     if(c == CR || c == LF){
186                         isNLUnescaped = TRUE;
187                     }
188                 }
189 
190                 if(c==ESCAPE && !isFollowingCharEscaped){
191                     isFollowingCharEscaped = TRUE;
192                 }else{
193                     U_APPEND_CHAR32(c, pTarget,len);
194                     pTarget = target;
195                     ustr_uscat(token, pTarget,len, status);
196                     isFollowingCharEscaped = FALSE;
197                     len=0;
198                     if(c == CR || c == LF){
199                         if(isNLUnescaped == FALSE && prevC!=CR){
200                             lineCount++;
201                         }
202                         isNLUnescaped = FALSE;
203                     }
204                 }
205 
206                 if (U_FAILURE(*status)) {
207                     return TOK_ERROR;
208                 }
209                 prevC = c;
210             }
211         } else {
212             if (token->fLength > 0) {
213                 ustr_ucat(token, SPACE, status);
214 
215                 if (U_FAILURE(*status)) {
216                     return TOK_ERROR;
217                 }
218             }
219 
220             if(lastStringWasQuoted){
221                 if(getShowWarning()){
222                     warning(lineCount, "Mixing quoted and unquoted strings");
223                 }
224                 if(isStrict()){
225                     return TOK_ERROR;
226                 }
227 
228             }
229 
230             lastStringWasQuoted = FALSE;
231 
232             /* if we reach here we are mixing
233              * quoted and unquoted strings
234              * warn in normal mode and error in
235              * pedantic mode
236              */
237 
238             if (c == ESCAPE) {
239                 pTarget = target;
240                 c       = unescape(buf, status);
241 
242                 /* EOF reached */
243                 if (c == U_EOF) {
244                     return TOK_ERROR;
245                 }
246             }
247 
248             U_APPEND_CHAR32(c, pTarget,len);
249             pTarget = target;
250             ustr_uscat(token, pTarget,len, status);
251             len=0;
252 
253             if (U_FAILURE(*status)) {
254                 return TOK_ERROR;
255             }
256 
257             for (;;) {
258                 /* DON'T skip whitespace */
259                 c = getNextChar(buf, FALSE, NULL, status);
260 
261                 /* EOF reached */
262                 if (c == U_EOF) {
263                     ucbuf_ungetc(c, buf);
264                     return TOK_STRING;
265                 }
266 
267                 if (U_FAILURE(*status)) {
268                     return TOK_STRING;
269                 }
270 
271                 if (c == QUOTE
272                         || c == OPENBRACE
273                         || c == CLOSEBRACE
274                         || c == COMMA
275                         || c == COLON) {
276                     ucbuf_ungetc(c, buf);
277                     break;
278                 }
279 
280                 if (isWhitespace(c)) {
281                     break;
282                 }
283 
284                 if (c == ESCAPE) {
285                     pTarget = target;
286                     c       = unescape(buf, status);
287 
288                     if (c == U_ERR) {
289                         return TOK_ERROR;
290                     }
291                 }
292 
293                 U_APPEND_CHAR32(c, pTarget,len);
294                 pTarget = target;
295                 ustr_uscat(token, pTarget,len, status);
296                 len=0;
297                 if (U_FAILURE(*status)) {
298                     return TOK_ERROR;
299                 }
300             }
301         }
302 
303         /* DO skip whitespace */
304         c = getNextChar(buf, TRUE, NULL, status);
305 
306         if (U_FAILURE(*status)) {
307             return TOK_STRING;
308         }
309 
310         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
311             ucbuf_ungetc(c, buf);
312             return TOK_STRING;
313         }
314     }
315 }
316 
317 /* Retrieve the next character.  If skipwhite is
318    true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)319 static UChar32 getNextChar(UCHARBUF* buf,
320                            UBool skipwhite,
321                            struct UString *token,
322                            UErrorCode *status) {
323     UChar32 c, c2;
324 
325     if (U_FAILURE(*status)) {
326         return U_EOF;
327     }
328 
329     for (;;) {
330         c = ucbuf_getc(buf,status);
331 
332         if (c == U_EOF) {
333             return U_EOF;
334         }
335 
336         if (skipwhite && isWhitespace(c)) {
337             continue;
338         }
339 
340         /* This also handles the get() failing case */
341         if (c != SLASH) {
342             return c;
343         }
344 
345         c = ucbuf_getc(buf,status); /* "/c" */
346 
347         if (c == U_EOF) {
348             return U_EOF;
349         }
350 
351         switch (c) {
352         case SLASH:  /* "//" */
353             seekUntilNewline(buf, NULL, status);
354             break;
355 
356         case ASTERISK:  /* " / * " */
357             c2 = ucbuf_getc(buf, status); /* "/ * c" */
358             if(c2 == ASTERISK){  /* "/ * *" */
359                 /* parse multi-line comment and store it in token*/
360                 seekUntilEndOfComment(buf, token, status);
361             } else {
362                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
363                 seekUntilEndOfComment(buf, NULL, status);
364             }
365             break;
366 
367         default:
368             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
369             /* If get() failed this is a NOP */
370             return SLASH;
371         }
372 
373     }
374 }
375 
/* Consume characters from buf up to and including the next newline
   character (as recognised by isNewline(), which also advances the
   line counter), or until the end of the input or until *status is no
   longer U_ZERO_ERROR.  If token is not NULL, every character consumed,
   including the terminating newline, is appended to it.  Does nothing
   when *status is already a failure on entry. */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 c;

    if (U_FAILURE(*status)) {
        return;
    }

    do {
        c = ucbuf_getc(buf,status);
        /* add the char to token; U_EOF is a sentinel, not text, so it
           must never end up in the token */
        if(token!=NULL && c != U_EOF){
            ustr_u32cat(token, c, status);
        }
    } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
}
393 
/* Consume the body of a block comment whose opening delimiter has
   already been read, up to and including the closing asterisk-slash.
   If token is not NULL the comment text (without the closing delimiter)
   is appended to it.  The line counter is advanced for every line break
   inside the comment.

   If the input ends before the comment is closed, *status is set to
   U_INVALID_FORMAT_ERROR and an error is reported against the line on
   which scanning started.  Does nothing when *status is already a
   failure on entry. */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32  c, d;
    uint32_t line;

    if (U_FAILURE(*status)) {
        return;
    }

    /* remember where the scan began for the error message */
    line = lineCount;

    do {
        c = ucbuf_getc(buf, status);

        if (c == ASTERISK) {
            d = ucbuf_getc(buf, status);

            if (d != SLASH) {
                /* a lone asterisk is ordinary comment text */
                ucbuf_ungetc(d, buf);
            } else {
                break;
            }
        }
        /* add the char to token; U_EOF is a sentinel, not text, so it
           must never end up in the stored comment */
        if(token!=NULL && c != U_EOF){
            ustr_u32cat(token, c, status);
        }
        /* increment the lineCount (isNewline() is called for its side
           effect only) */
        isNewline(c);

    } while (c != U_EOF && *status == U_ZERO_ERROR);

    if (c == U_EOF) {
        *status = U_INVALID_FORMAT_ERROR;
        error(line, "unterminated comment detected");
    }
}
432 
unescape(UCHARBUF * buf,UErrorCode * status)433 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
434     if (U_FAILURE(*status)) {
435         return U_EOF;
436     }
437 
438     /* We expect to be called after the ESCAPE has been seen, but
439      * u_fgetcx needs an ESCAPE to do its magic. */
440     ucbuf_ungetc(ESCAPE, buf);
441 
442     return ucbuf_getcx32(buf, status);
443 }
444 
isWhitespace(UChar32 c)445 static UBool isWhitespace(UChar32 c) {
446     switch (c) {
447         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
448     case 0x000A:
449     case 0x2029:
450         lineCount++;
451     case 0x000D:
452     case 0x0020:
453     case 0x0009:
454     case 0xFEFF:
455         return TRUE;
456 
457     default:
458         return FALSE;
459     }
460 }
461 
isNewline(UChar32 c)462 static UBool isNewline(UChar32 c) {
463     switch (c) {
464         /* '\n', '\r', 0x2029 */
465     case 0x000A:
466     case 0x2029:
467         lineCount++;
468     case 0x000D:
469         return TRUE;
470 
471     default:
472         return FALSE;
473     }
474 }
475