package java_cup;

import java.util.Hashtable;

import java_cup.runtime.str_token;
import java_cup.runtime.token;

/** This class implements a small scanner (aka lexical analyzer or lexer) for
 *  the JavaCup specification.  This scanner reads characters from standard
 *  input (System.in) and returns tokens carrying the terminal number of the
 *  next token.  Once end of input is reached the EOF token is returned on
 *  every subsequent call.<p>
 *  Tokens currently returned include: <pre>
 *    Symbol        Constant Returned     Symbol        Constant Returned
 *    ------        -----------------     ------        -----------------
 *    "package"     PACKAGE               "import"      IMPORT
 *    "code"        CODE                  "action"      ACTION
 *    "parser"      PARSER                "terminal"    TERMINAL
 *    "non"         NON                   "init"        INIT
 *    "scan"        SCAN                  "with"        WITH
 *    "start"       START                 ;             SEMI
 *    ,             COMMA                 *             STAR
 *    .             DOT                   :             COLON
 *    ::=           COLON_COLON_EQUALS    |             BAR
 *    identifier    ID                    {:...:}       CODE_STRING
 *    "debug"       DEBUG
 *  </pre>
 *  All symbol constants are defined in sym.java which is generated by
 *  JavaCup from parser.cup.<p>
 *
 *  In addition to the scanner proper (called first via init() then with
 *  next_token() to get each token) this class provides simple error and
 *  warning routines and keeps a count of errors and warnings that is
 *  publicly accessible.<p>
 *
 *  This class is "static" (i.e., it has only static members and methods).
 *
 * @version last updated: 11/25/95
 * @author  Scott Hudson
 */
public class lexer {

  /*-----------------------------------------------------------*/
  /*--- Constructor(s) ----------------------------------------*/
  /*-----------------------------------------------------------*/

  /** The only constructor is private, so no instances can be created. */
  private lexer() { }

  /*-----------------------------------------------------------*/
  /*--- Static (Class) Variables ------------------------------*/
  /*-----------------------------------------------------------*/

  /** First character of lookahead. */
  protected static int next_char;

  /** Second character of lookahead. */
  protected static int next_char2;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** EOF constant.  This is the value the scanner compares the result of
   *  System.in.read() against to detect end of input.
   */
  protected static final int EOF_CHAR = -1;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Table of keywords.  Keywords are initially treated as identifiers.
   *  Just before they are returned we look them up in this table to see if
   *  they match one of the keywords.  The string of the name is the key here,
   *  which indexes Integer objects holding the symbol number.
   */
  protected static Hashtable keywords = new Hashtable(23);

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Table of single character symbols.  For ease of implementation, we
   *  store all unambiguous single character tokens in this table of Integer
   *  objects keyed by Integer objects with the numerical value of the
   *  appropriate char (currently Character objects have a bug which precludes
   *  their use in tables).
   */
  protected static Hashtable char_symbols = new Hashtable(11);

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Current line number for use in error messages. */
  protected static int current_line = 1;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Character position in current line. */
  protected static int current_position = 1;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Count of total errors detected so far. */
  public static int error_count = 0;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Count of warnings issued so far. */
  public static int warning_count = 0;

  /*-----------------------------------------------------------*/
  /*--- Static Methods ----------------------------------------*/
  /*-----------------------------------------------------------*/

  /** Initialize the scanner.  This sets up the keywords and char_symbols
   *  tables and reads the first two characters of lookahead.
   *  @throws java.io.IOException if reading from System.in fails.
   */
  public static void init() throws java.io.IOException
    {
      /* set up the keyword table */
      keywords.put("package",  new Integer(sym.PACKAGE));
      keywords.put("import",   new Integer(sym.IMPORT));
      keywords.put("code",     new Integer(sym.CODE));
      keywords.put("action",   new Integer(sym.ACTION));
      keywords.put("parser",   new Integer(sym.PARSER));
      keywords.put("terminal", new Integer(sym.TERMINAL));
      keywords.put("non",      new Integer(sym.NON));
      keywords.put("init",     new Integer(sym.INIT));
      keywords.put("scan",     new Integer(sym.SCAN));
      keywords.put("with",     new Integer(sym.WITH));
      keywords.put("start",    new Integer(sym.START));
      keywords.put("debug",    new Integer(sym.DEBUG));

      /* set up the table of single character symbols */
      char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
      char_symbols.put(new Integer(','), new Integer(sym.COMMA));
      char_symbols.put(new Integer('*'), new Integer(sym.STAR));
      char_symbols.put(new Integer('.'), new Integer(sym.DOT));
      char_symbols.put(new Integer('|'), new Integer(sym.BAR));

      /* read two characters of lookahead; once EOF has been seen we do
         not read from the stream again */
      next_char = System.in.read();
      if (next_char == EOF_CHAR)
        next_char2 = EOF_CHAR;
      else
        next_char2 = System.in.read();
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Advance the scanner one character in the input stream.  This moves
   *  next_char2 to next_char and then reads a new next_char2.  The line
   *  and position counters are updated based on the character consumed.
   *  @throws java.io.IOException if reading from System.in fails.
   */
  protected static void advance() throws java.io.IOException
    {
      int old_char;

      old_char = next_char;
      next_char = next_char2;
      if (next_char == EOF_CHAR)
        next_char2 = EOF_CHAR;
      else
        next_char2 = System.in.read();

      /* count this */
      current_position++;
      if (old_char == '\n')
        {
          /* the character just consumed ended a line */
          current_line++;
          current_position = 1;
        }
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Emit an error message.  The message will be marked with both the
   *  current line number and the position in the line.  Error messages
   *  are printed on standard error (System.err) and error_count is
   *  incremented.
   * @param message the message to print.
   */
  public static void emit_error(String message)
    {
      System.err.println("Error at " + current_line + "(" + current_position +
                         "): " + message);
      error_count++;
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Emit a warning message.  The message will be marked with both the
   *  current line number and the position in the line.  Messages are
   *  printed on standard error (System.err) and warning_count is
   *  incremented.
   * @param message the message to print.
   */
  public static void emit_warn(String message)
    {
      System.err.println("Warning at " + current_line + "(" + current_position +
                         "): " + message);
      warning_count++;
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Determine if a character is ok to start an id.  Only the ASCII
   *  letters and underscore are accepted.
   * @param ch the character in question.
   * @return true if ch may begin an identifier.
   */
  protected static boolean id_start_char(int ch)
    {
      return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
             (ch == '_');

      // later need to deal with non-8-bit chars here
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Determine if a character is ok for the middle of an id.  This is any
   *  character accepted by id_start_char() plus the ASCII digits.
   * @param ch the character in question.
   * @return true if ch may appear after the first character of an identifier.
   */
  protected static boolean id_char(int ch)
    {
      return id_start_char(ch) || (ch >= '0' && ch <= '9');
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Try to look up a single character symbol, returns -1 for not found.
   * @param ch the character in question.
   * @return the symbol number registered for ch in char_symbols, or -1.
   */
  protected static int find_single_char(int ch)
    {
      Integer result;

      result = (Integer)char_symbols.get(new Integer((char)ch));
      if (result == null)
        return -1;
      else
        return result.intValue();
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Handle swallowing up a comment.  Both old style C and new style C++
   *  comments are handled.  On entry next_char is expected to be '/'.
   *  @throws java.io.IOException if reading from System.in fails.
   */
  protected static void swallow_comment() throws java.io.IOException
    {
      /* next_char == '/' at this point */

      /* is it a traditional comment */
      if (next_char2 == '*')
        {
          /* swallow the opener */
          advance(); advance();

          /* swallow the comment until end of comment or EOF */
          for (;;)
            {
              /* if its EOF we have an error */
              if (next_char == EOF_CHAR)
                {
                  emit_error("Specification file ends inside a comment");
                  return;
                }

              /* if we can see the closer we are done */
              if (next_char == '*' && next_char2 == '/')
                {
                  advance();
                  advance();
                  return;
                }

              /* otherwise swallow char and move on */
              advance();
            }
        }

      /* is it a new style comment */
      if (next_char2 == '/')
        {
          /* swallow the opener */
          advance(); advance();

          /* swallow to '\n', '\f', or EOF (the terminator itself is left
             for the caller to handle) */
          while (next_char != '\n' && next_char != '\f' && next_char != EOF_CHAR)
            advance();

          return;
        }

      /* shouldn't get here, but... if we get here we have an error */
      emit_error("Malformed comment in specification -- ignored");
      advance();
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Swallow up a code string.  Code strings begin with "{:" and include
   *  all characters up to the first occurrence of ":}" (there is no way to
   *  include ":}" inside a code string).  If end of input is reached before
   *  the closer, an error is emitted and the characters collected so far
   *  are returned.
   * @return an str_token (CODE_STRING) suitable for return by the scanner.
   * @throws java.io.IOException if reading from System.in fails.
   */
  protected static token do_code_string() throws java.io.IOException
    {
      StringBuffer result = new StringBuffer();

      /* at this point we have lookahead of "{:" -- swallow that */
      advance(); advance();

      /* save chars until we see ":}" */
      while (!(next_char == ':' && next_char2 == '}'))
        {
          /* if we have run off the end issue a message and stop; there is
             no closer to skip, so return directly rather than advancing */
          if (next_char == EOF_CHAR)
            {
              emit_error("Specification file ends inside a code string");
              return new str_token(sym.CODE_STRING, result.toString());
            }

          /* otherwise record the char and move on */
          result.append((char)next_char);
          advance();
        }

      /* advance past the closer and build a return token */
      advance(); advance();
      return new str_token(sym.CODE_STRING, result.toString());
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Process an identifier.  Identifiers begin with a letter or underscore
   *  (see id_start_char()), which is followed by zero or more letters,
   *  digits, or underscores (see id_char()).  If the collected text matches
   *  an entry in the keywords table, a plain token for that keyword is
   *  returned instead of an ID token.
   * @return a keyword token, or an str_token (ID) carrying the identifier.
   * @throws java.io.IOException if reading from System.in fails.
   */
  protected static token do_id() throws java.io.IOException
    {
      StringBuffer result = new StringBuffer();
      String       result_str;
      Integer      keyword_num;

      /* next_char holds first character of id */
      result.append((char)next_char);
      advance();

      /* collect up characters while they fit in id */
      while (id_char(next_char))
        {
          result.append((char)next_char);
          advance();
        }

      /* extract a string and try to look it up as a keyword */
      result_str = result.toString();
      keyword_num = (Integer)keywords.get(result_str);

      /* if we found something, return that keyword */
      if (keyword_num != null)
        return new token(keyword_num.intValue());

      /* otherwise build and return an id token with an attached string */
      return new str_token(sym.ID, result_str);
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Return one token.  This is the main external interface to the scanner.
   *  It consumes sufficient characters to determine the next input token
   *  and returns it.  To help with debugging, this routine actually calls
   *  real_next_token() which does the work.  If you need to debug the
   *  parser, this can be changed to call debug_next_token() which prints
   *  a debugging message before returning the token.
   * @return the next token of the input.
   * @throws java.io.IOException if reading from System.in fails.
   */
  public static token next_token() throws java.io.IOException
    {
      return real_next_token();
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Debugging version of next_token().  This routine calls the real scanning
   *  routine, prints a message on System.out indicating what the token is,
   *  then returns it.
   * @return the next token of the input.
   * @throws java.io.IOException if reading from System.in fails.
   */
  public static token debug_next_token() throws java.io.IOException
    {
      token result = real_next_token();
      System.out.println("# next_token() => " + result.sym);
      return result;
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** The actual routine to return one token.  This is normally called from
   *  next_token(), but for debugging purposes can be called indirectly from
   *  debug_next_token().  White space and comments are skipped; characters
   *  that do not start any token produce a warning and are skipped as well.
   * @return the next token of the input (an EOF token at end of input).
   * @throws java.io.IOException if reading from System.in fails.
   */
  protected static token real_next_token() throws java.io.IOException
    {
      int sym_num;

      for (;;)
        {
          /* look for white space */
          if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
              next_char == '\f' || next_char == '\r')
            {
              /* advance past it and try the next character */
              advance();
              continue;
            }

          /* look for a single character symbol */
          sym_num = find_single_char(next_char);
          if (sym_num != -1)
            {
              /* found one -- advance past it and return a token for it */
              advance();
              return new token(sym_num);
            }

          /* look for : or ::= */
          if (next_char == ':')
            {
              /* if we don't have a second ':' return COLON */
              if (next_char2 != ':')
                {
                  advance();
                  return new token(sym.COLON);
                }

              /* move forward and look for the '=' */
              advance();
              if (next_char2 == '=')
                {
                  advance(); advance();
                  return new token(sym.COLON_COLON_EQUALS);
                }
              else
                {
                  /* return just the colon (already consumed); the second
                     ':' stays in the lookahead for the next call */
                  return new token(sym.COLON);
                }
            }

          /* look for a comment */
          if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
            {
              /* swallow then continue the scan */
              swallow_comment();
              continue;
            }

          /* look for start of code string */
          if (next_char == '{' && next_char2 == ':')
            return do_code_string();

          /* look for an id or keyword */
          if (id_start_char(next_char)) return do_id();

          /* look for EOF */
          if (next_char == EOF_CHAR) return new token(sym.EOF);

          /* if we get here, we have an unrecognized character */
          emit_warn("Unrecognized character '" +
                    (char)next_char + "'(" + next_char +
                    ") -- ignored");

          /* advance past it */
          advance();
        }
    }

  /*-----------------------------------------------------------*/
}