1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1998-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 *
9 * File read.c
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 05/26/99 stephen Creation.
15 * 5/10/01 Ram removed ustdio dependency
16 *******************************************************************************
17 */
18
19 #include "read.h"
20 #include "errmsg.h"
21 #include "unicode/ustring.h"
22 #include "unicode/utf16.h"
23
/* Code points that have a special meaning to the tokenizer in this file. */
#define OPENBRACE    0x007B   /* '{'  */
#define CLOSEBRACE   0x007D   /* '}'  */
#define COMMA        0x002C   /* ','  */
#define QUOTE        0x0022   /* '"'  */
#define ESCAPE       0x005C   /* '\\' */
#define SLASH        0x002F   /* '/'  */
#define ASTERISK     0x002A   /* '*'  */
#define SPACE        0x0020   /* ' '  */
#define COLON        0x003A   /* ':'  */
#define BADBOM       0xFFFE   /* U+FFFE; getNextToken reports TOK_ERROR for it */
#define CR           0x000D   /* carriage return */
#define LF           0x000A   /* line feed */
36
37 static int32_t lineCount;
38
39 /* Protos */
40 static enum ETokenType getStringToken(UCHARBUF *buf,
41 UChar32 initialChar,
42 struct UString *token,
43 UErrorCode *status);
44
45 static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
46 static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status);
47 static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
48 static UBool isWhitespace (UChar32 c);
49 static UBool isNewline (UChar32 c);
50
resetLineNumber()51 U_CFUNC void resetLineNumber() {
52 lineCount = 1;
53 }
54
55 /* Read and return the next token from the stream. If the token is of
56 type eString, fill in the token parameter with the token. If the
57 token is eError, then the status parameter will contain the
58 specific error. This will be eItemNotFound at the end of file,
59 indicating that all tokens have been returned. This method will
60 never return eString twice in a row; instead, multiple adjacent
61 string tokens will be merged into one, with no intervening
62 space. */
63 U_CFUNC enum ETokenType
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)64 getNextToken(UCHARBUF* buf,
65 struct UString *token,
66 uint32_t *linenumber, /* out: linenumber of token */
67 struct UString *comment,
68 UErrorCode *status) {
69 enum ETokenType result;
70 UChar32 c;
71
72 if (U_FAILURE(*status)) {
73 return TOK_ERROR;
74 }
75
76 /* Skip whitespace */
77 c = getNextChar(buf, TRUE, comment, status);
78
79 if (U_FAILURE(*status)) {
80 return TOK_ERROR;
81 }
82
83 *linenumber = lineCount;
84
85 switch(c) {
86 case BADBOM:
87 return TOK_ERROR;
88 case OPENBRACE:
89 return TOK_OPEN_BRACE;
90 case CLOSEBRACE:
91 return TOK_CLOSE_BRACE;
92 case COMMA:
93 return TOK_COMMA;
94 case U_EOF:
95 return TOK_EOF;
96 case COLON:
97 return TOK_COLON;
98
99 default:
100 result = getStringToken(buf, c, token, status);
101 }
102
103 *linenumber = lineCount;
104 return result;
105 }
106
107 /* Copy a string token into the given UnicodeString. Upon entry, we
108 have already read the first character of the string token, which is
109 not a whitespace character (but may be a QUOTE or ESCAPE). This
110 function reads all subsequent characters that belong with this
111 string, and copy them into the token parameter. The other
112 important, and slightly convoluted purpose of this function is to
113 merge adjacent strings. It looks forward a bit, and if the next
114 non comment, non whitespace item is a string, it reads it in as
115 well. If two adjacent strings are quoted, they are merged without
116 intervening space. Otherwise a single SPACE character is
117 inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)118 static enum ETokenType getStringToken(UCHARBUF* buf,
119 UChar32 initialChar,
120 struct UString *token,
121 UErrorCode *status) {
122 UBool lastStringWasQuoted;
123 UChar32 c;
124 UChar target[3] = { '\0' };
125 UChar *pTarget = target;
126 int len=0;
127 UBool isFollowingCharEscaped=FALSE;
128 UBool isNLUnescaped = FALSE;
129 UChar32 prevC=0;
130
131 /* We are guaranteed on entry that initialChar is not a whitespace
132 character. If we are at the EOF, or have some other problem, it
133 doesn't matter; we still want to validly return the initialChar
134 (if nothing else) as a string token. */
135
136 if (U_FAILURE(*status)) {
137 return TOK_ERROR;
138 }
139
140 /* setup */
141 lastStringWasQuoted = FALSE;
142 c = initialChar;
143 ustr_setlen(token, 0, status);
144
145 if (U_FAILURE(*status)) {
146 return TOK_ERROR;
147 }
148
149 for (;;) {
150 if (c == QUOTE) {
151 if (!lastStringWasQuoted && token->fLength > 0) {
152 ustr_ucat(token, SPACE, status);
153
154 if (U_FAILURE(*status)) {
155 return TOK_ERROR;
156 }
157 }
158
159 lastStringWasQuoted = TRUE;
160
161 for (;;) {
162 c = ucbuf_getc(buf,status);
163
164 /* EOF reached */
165 if (c == U_EOF) {
166 return TOK_EOF;
167 }
168
169 /* Unterminated quoted strings */
170 if (U_FAILURE(*status)) {
171 return TOK_ERROR;
172 }
173
174 if (c == QUOTE && !isFollowingCharEscaped) {
175 break;
176 }
177
178 if (c == ESCAPE && !isFollowingCharEscaped) {
179 pTarget = target;
180 c = unescape(buf, status);
181
182 if (c == U_ERR) {
183 return TOK_ERROR;
184 }
185 if(c == CR || c == LF){
186 isNLUnescaped = TRUE;
187 }
188 }
189
190 if(c==ESCAPE && !isFollowingCharEscaped){
191 isFollowingCharEscaped = TRUE;
192 }else{
193 U_APPEND_CHAR32(c, pTarget,len);
194 pTarget = target;
195 ustr_uscat(token, pTarget,len, status);
196 isFollowingCharEscaped = FALSE;
197 len=0;
198 if(c == CR || c == LF){
199 if(isNLUnescaped == FALSE && prevC!=CR){
200 lineCount++;
201 }
202 isNLUnescaped = FALSE;
203 }
204 }
205
206 if (U_FAILURE(*status)) {
207 return TOK_ERROR;
208 }
209 prevC = c;
210 }
211 } else {
212 if (token->fLength > 0) {
213 ustr_ucat(token, SPACE, status);
214
215 if (U_FAILURE(*status)) {
216 return TOK_ERROR;
217 }
218 }
219
220 if(lastStringWasQuoted){
221 if(getShowWarning()){
222 warning(lineCount, "Mixing quoted and unquoted strings");
223 }
224 if(isStrict()){
225 return TOK_ERROR;
226 }
227
228 }
229
230 lastStringWasQuoted = FALSE;
231
232 /* if we reach here we are mixing
233 * quoted and unquoted strings
234 * warn in normal mode and error in
235 * pedantic mode
236 */
237
238 if (c == ESCAPE) {
239 pTarget = target;
240 c = unescape(buf, status);
241
242 /* EOF reached */
243 if (c == U_EOF) {
244 return TOK_ERROR;
245 }
246 }
247
248 U_APPEND_CHAR32(c, pTarget,len);
249 pTarget = target;
250 ustr_uscat(token, pTarget,len, status);
251 len=0;
252
253 if (U_FAILURE(*status)) {
254 return TOK_ERROR;
255 }
256
257 for (;;) {
258 /* DON'T skip whitespace */
259 c = getNextChar(buf, FALSE, NULL, status);
260
261 /* EOF reached */
262 if (c == U_EOF) {
263 ucbuf_ungetc(c, buf);
264 return TOK_STRING;
265 }
266
267 if (U_FAILURE(*status)) {
268 return TOK_STRING;
269 }
270
271 if (c == QUOTE
272 || c == OPENBRACE
273 || c == CLOSEBRACE
274 || c == COMMA
275 || c == COLON) {
276 ucbuf_ungetc(c, buf);
277 break;
278 }
279
280 if (isWhitespace(c)) {
281 break;
282 }
283
284 if (c == ESCAPE) {
285 pTarget = target;
286 c = unescape(buf, status);
287
288 if (c == U_ERR) {
289 return TOK_ERROR;
290 }
291 }
292
293 U_APPEND_CHAR32(c, pTarget,len);
294 pTarget = target;
295 ustr_uscat(token, pTarget,len, status);
296 len=0;
297 if (U_FAILURE(*status)) {
298 return TOK_ERROR;
299 }
300 }
301 }
302
303 /* DO skip whitespace */
304 c = getNextChar(buf, TRUE, NULL, status);
305
306 if (U_FAILURE(*status)) {
307 return TOK_STRING;
308 }
309
310 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
311 ucbuf_ungetc(c, buf);
312 return TOK_STRING;
313 }
314 }
315 }
316
317 /* Retrieve the next character. If skipwhite is
318 true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)319 static UChar32 getNextChar(UCHARBUF* buf,
320 UBool skipwhite,
321 struct UString *token,
322 UErrorCode *status) {
323 UChar32 c, c2;
324
325 if (U_FAILURE(*status)) {
326 return U_EOF;
327 }
328
329 for (;;) {
330 c = ucbuf_getc(buf,status);
331
332 if (c == U_EOF) {
333 return U_EOF;
334 }
335
336 if (skipwhite && isWhitespace(c)) {
337 continue;
338 }
339
340 /* This also handles the get() failing case */
341 if (c != SLASH) {
342 return c;
343 }
344
345 c = ucbuf_getc(buf,status); /* "/c" */
346
347 if (c == U_EOF) {
348 return U_EOF;
349 }
350
351 switch (c) {
352 case SLASH: /* "//" */
353 seekUntilNewline(buf, NULL, status);
354 break;
355
356 case ASTERISK: /* " / * " */
357 c2 = ucbuf_getc(buf, status); /* "/ * c" */
358 if(c2 == ASTERISK){ /* "/ * *" */
359 /* parse multi-line comment and store it in token*/
360 seekUntilEndOfComment(buf, token, status);
361 } else {
362 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
363 seekUntilEndOfComment(buf, NULL, status);
364 }
365 break;
366
367 default:
368 ucbuf_ungetc(c, buf); /* "/c" - put back the c */
369 /* If get() failed this is a NOP */
370 return SLASH;
371 }
372
373 }
374 }
375
/* Consume characters from buf up to and including the first one for
 * which isNewline() is true.  Reading also stops at end of input or as
 * soon as *status is no longer U_ZERO_ERROR.
 *
 *   buf    - input buffer.
 *   token  - when not NULL, every character read (the last one included)
 *            is appended to it.
 *   status - in/out error code; nothing is read if it already holds a
 *            failure.
 *
 * isNewline() advances the line counter as a side effect. */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 ch;

    if (U_FAILURE(*status)) {
        return;
    }

    for (;;) {
        ch = ucbuf_getc(buf, status);

        /* add the char to token */
        if (token != NULL) {
            ustr_u32cat(token, ch, status);
        }

        if (isNewline(ch)) {
            break;
        }
        if (ch == U_EOF) {
            break;
        }
        if (*status != U_ZERO_ERROR) {
            break;
        }
    }
}
393
/* Consume a block comment whose opening delimiter has already been read,
 * up to and including the terminating asterisk-slash pair.
 *
 *   buf    - input buffer.
 *   token  - when not NULL, receives every character of the comment body;
 *            the terminator is not included.
 *   status - in/out error code; nothing is read if it already holds a
 *            failure.  Set to U_INVALID_FORMAT_ERROR when the input ends
 *            before the terminator is found.
 *
 * Scanning also stops as soon as *status is no longer U_ZERO_ERROR.  Line
 * terminators inside the comment advance the line counter. */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32 c, d;
    uint32_t line;

    if (U_FAILURE(*status)) {
        return;
    }

    /* Remember where the comment starts for the error message. */
    line = lineCount;

    do {
        c = ucbuf_getc(buf, status);

        /* End of input: leave before the append below so that the
           end-of-file marker is not added to the comment text. */
        if (c == U_EOF) {
            break;
        }

        if (c == ASTERISK) {
            d = ucbuf_getc(buf, status);

            if (d == SLASH) {
                break;
            }
            /* Not the terminator: put d back and keep the asterisk as
               ordinary comment text. */
            ucbuf_ungetc(d, buf);
        }

        /* add the char to token */
        if (token != NULL) {
            ustr_u32cat(token, c, status);
        }

        /* called for its side effect: increments lineCount */
        isNewline(c);

    } while (*status == U_ZERO_ERROR);

    if (c == U_EOF) {
        *status = U_INVALID_FORMAT_ERROR;
        error(line, "unterminated comment detected");
    }
}
432
unescape(UCHARBUF * buf,UErrorCode * status)433 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
434 if (U_FAILURE(*status)) {
435 return U_EOF;
436 }
437
438 /* We expect to be called after the ESCAPE has been seen, but
439 * u_fgetcx needs an ESCAPE to do its magic. */
440 ucbuf_ungetc(ESCAPE, buf);
441
442 return ucbuf_getcx32(buf, status);
443 }
444
isWhitespace(UChar32 c)445 static UBool isWhitespace(UChar32 c) {
446 switch (c) {
447 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
448 case 0x000A:
449 case 0x2029:
450 lineCount++;
451 case 0x000D:
452 case 0x0020:
453 case 0x0009:
454 case 0xFEFF:
455 return TRUE;
456
457 default:
458 return FALSE;
459 }
460 }
461
isNewline(UChar32 c)462 static UBool isNewline(UChar32 c) {
463 switch (c) {
464 /* '\n', '\r', 0x2029 */
465 case 0x000A:
466 case 0x2029:
467 lineCount++;
468 case 0x000D:
469 return TRUE;
470
471 default:
472 return FALSE;
473 }
474 }
475