1 package java_cup;
2 
3 import java.util.Hashtable;
4 
5 import java_cup.runtime.str_token;
6 import java_cup.runtime.token;
7 
8 /** This class implements a small scanner (aka lexical analyzer or lexer) for
9  *  the JavaCup specification.  This scanner reads characters from standard
10  *  input (System.in) and returns integers corresponding to the terminal
11  *  number of the next token.  Once end of input is reached the EOF token is
12  *  returned on every subsequent call.<p>
13  *  Tokens currently returned include: <pre>
14  *    Symbol        Constant Returned     Symbol        Constant Returned
15  *    ------        -----------------     ------        -----------------
16  *    "package"     PACKAGE               "import"      IMPORT
17  *    "code"        CODE                  "action"      ACTION
18  *    "parser"      PARSER                "terminal"    TERMINAL
19  *    "non"         NON                   "init"        INIT
20  *    "scan"        SCAN                  "with"        WITH
21  *    "start"       START                   ;           SEMI
22  *      ,           COMMA                   *           STAR
23  *      .           DOT                     :           COLON
24  *      ::=         COLON_COLON_EQUALS      |           BAR
25  *    identifier    ID                    {:...:}       CODE_STRING
26  *    "debug"       DEBUG
27  *  </pre>
28  *  All symbol constants are defined in sym.java which is generated by
29  *  JavaCup from parser.cup.<p>
30  *
31  *  In addition to the scanner proper (called first via init() then with
32  *  next_token() to get each token) this class provides simple error and
33  *  warning routines and keeps a count of errors and warnings that is
34  *  publicly accessible.<p>
35  *
36  *  This class is "static" (i.e., it has only static members and methods).
37  *
38  * @version last updated: 11/25/95
39  * @author  Scott Hudson
40  */
41 public class lexer {
42 
43   /*-----------------------------------------------------------*/
44   /*--- Constructor(s) ----------------------------------------*/
45   /*-----------------------------------------------------------*/
46 
47   /** The only constructor is private, so no instances can be created. */
lexer()48   private lexer() { }
49 
50   /*-----------------------------------------------------------*/
51   /*--- Static (Class) Variables ------------------------------*/
52   /*-----------------------------------------------------------*/
53 
54   /** First character of lookahead. */
55   protected static int next_char;
56 
57   /** Second character of lookahead. */
58   protected static int next_char2;
59 
60   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
61 
62   /** EOF constant. */
63   protected static final int EOF_CHAR = -1;
64 
65   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
66 
67   /** Table of keywords.  Keywords are initially treated as identifiers.
68    *  Just before they are returned we look them up in this table to see if
69    *  they match one of the keywords.  The string of the name is the key here,
70    *  which indexes Integer objects holding the symbol number.
71    */
72   protected static Hashtable keywords = new Hashtable(23);
73 
74   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
75 
76   /** Table of single character symbols.  For ease of implementation, we
77    *  store all unambiguous single character tokens in this table of Integer
78    *  objects keyed by Integer objects with the numerical value of the
79    *  appropriate char (currently Character objects have a bug which precludes
80    *  their use in tables).
81    */
82   protected static Hashtable char_symbols = new Hashtable(11);
83 
84   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
85 
86   /** Current line number for use in error messages. */
87   protected static int current_line = 1;
88 
89   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
90 
91   /** Character position in current line. */
92   protected static int current_position = 1;
93 
94   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
95 
96   /** Count of total errors detected so far. */
97   public static int error_count = 0;
98 
99   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
100 
101   /** Count of warnings issued so far */
102   public static int warning_count = 0;
103 
104   /*-----------------------------------------------------------*/
105   /*--- Static Methods ----------------------------------------*/
106   /*-----------------------------------------------------------*/
107 
108   /** Initialize the scanner.  This sets up the keywords and char_symbols
109     * tables and reads the first two characters of lookahead.
110     */
init()111   public static void init() throws java.io.IOException
112     {
113       /* set up the keyword table */
114       keywords.put("package",  new Integer(sym.PACKAGE));
115       keywords.put("import",   new Integer(sym.IMPORT));
116       keywords.put("code",     new Integer(sym.CODE));
117       keywords.put("action",   new Integer(sym.ACTION));
118       keywords.put("parser",   new Integer(sym.PARSER));
119       keywords.put("terminal", new Integer(sym.TERMINAL));
120       keywords.put("non",      new Integer(sym.NON));
121       keywords.put("init",     new Integer(sym.INIT));
122       keywords.put("scan",     new Integer(sym.SCAN));
123       keywords.put("with",     new Integer(sym.WITH));
124       keywords.put("start",    new Integer(sym.START));
125       keywords.put("debug",    new Integer(sym.DEBUG));
126 
127       /* set up the table of single character symbols */
128       char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
129       char_symbols.put(new Integer(','), new Integer(sym.COMMA));
130       char_symbols.put(new Integer('*'), new Integer(sym.STAR));
131       char_symbols.put(new Integer('.'), new Integer(sym.DOT));
132       char_symbols.put(new Integer('|'), new Integer(sym.BAR));
133 
134       /* read two characters of lookahead */
135       next_char = System.in.read();
136       if (next_char == EOF_CHAR)
137     next_char2 = EOF_CHAR;
138       else
139     next_char2 = System.in.read();
140     }
141 
142   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
143 
144   /** Advance the scanner one character in the input stream.  This moves
145    * next_char2 to next_char and then reads a new next_char2.
146    */
advance()147   protected static void advance() throws java.io.IOException
148     {
149       int old_char;
150 
151       old_char = next_char;
152       next_char = next_char2;
153       if (next_char == EOF_CHAR)
154     next_char2 = EOF_CHAR;
155       else
156     next_char2 = System.in.read();
157 
158       /* count this */
159       current_position++;
160       if (old_char == '\n')
161     {
162       current_line++;
163       current_position = 1;
164     }
165     }
166 
167   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
168 
169   /** Emit an error message.  The message will be marked with both the
170    *  current line number and the position in the line.  Error messages
171    *  are printed on standard error (System.err).
172    * @param message the message to print.
173    */
emit_error(String message)174   public static void emit_error(String message)
175     {
176       System.err.println("Error at " + current_line + "(" + current_position +
177              "): " + message);
178       error_count++;
179     }
180 
181   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
182 
183   /** Emit a warning message.  The message will be marked with both the
184    *  current line number and the position in the line.  Messages are
185    *  printed on standard error (System.err).
186    * @param message the message to print.
187    */
emit_warn(String message)188   public static void emit_warn(String message)
189     {
190       System.err.println("Warning at " + current_line + "(" + current_position +
191              "): " + message);
192       warning_count++;
193     }
194 
195   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
196 
197   /** Determine if a character is ok to start an id.
198    * @param ch the character in question.
199    */
id_start_char(int ch)200   protected static boolean id_start_char(int ch)
201     {
202       return (ch >= 'a' &&  ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
203          (ch == '_');
204 
205       // later need to deal with non-8-bit chars here
206     }
207 
208   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
209 
210   /** Determine if a character is ok for the middle of an id.
211    * @param ch the character in question.
212    */
id_char(int ch)213   protected static boolean id_char(int ch)
214     {
215       return id_start_char(ch) || (ch >= '0' && ch <= '9');
216     }
217 
218   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
219 
220   /** Try to look up a single character symbol, returns -1 for not found.
221    * @param ch the character in question.
222    */
find_single_char(int ch)223   protected static int find_single_char(int ch)
224     {
225       Integer result;
226 
227       result = (Integer)char_symbols.get(new Integer((char)ch));
228       if (result == null)
229     return -1;
230       else
231     return result.intValue();
232     }
233 
234   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
235 
236   /** Handle swallowing up a comment.  Both old style C and new style C++
237    *  comments are handled.
238    */
swallow_comment()239   protected static void swallow_comment() throws java.io.IOException
240     {
241       /* next_char == '/' at this point */
242 
243       /* is it a traditional comment */
244       if (next_char2 == '*')
245     {
246       /* swallow the opener */
247       advance(); advance();
248 
249       /* swallow the comment until end of comment or EOF */
250       for (;;)
251         {
252           /* if its EOF we have an error */
253           if (next_char == EOF_CHAR)
254         {
255           emit_error("Specification file ends inside a comment");
256           return;
257         }
258 
259           /* if we can see the closer we are done */
260           if (next_char == '*' && next_char2 == '/')
261         {
262           advance();
263           advance();
264           return;
265         }
266 
267           /* otherwise swallow char and move on */
268           advance();
269         }
270     }
271 
272       /* is its a new style comment */
273       if (next_char2 == '/')
274     {
275       /* swallow the opener */
276       advance(); advance();
277 
278       /* swallow to '\n', '\f', or EOF */
279       while (next_char != '\n' && next_char != '\f' && next_char!=EOF_CHAR)
280         advance();
281 
282       return;
283 
284     }
285 
286       /* shouldn't get here, but... if we get here we have an error */
287       emit_error("Malformed comment in specification -- ignored");
288       advance();
289     }
290 
291   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
292 
293   /** Swallow up a code string.  Code strings begin with "{:" and include
294       all characters up to the first occurrence of ":}" (there is no way to
295       include ":}" inside a code string).  The routine returns an str_token
296       object suitable for return by the scanner.
297    */
do_code_string()298   protected static token do_code_string() throws java.io.IOException
299     {
300       StringBuffer result = new StringBuffer();
301 
302       /* at this point we have lookahead of "{:" -- swallow that */
303       advance(); advance();
304 
305       /* save chars until we see ":}" */
306       while (!(next_char == ':' && next_char2 == '}'))
307     {
308       /* if we have run off the end issue a message and break out of loop */
309       if (next_char == EOF_CHAR)
310         {
311           emit_error("Specification file ends inside a code string");
312           break;
313         }
314 
315       /* otherwise record the char and move on */
316       result.append(new Character((char)next_char));
317       advance();
318     }
319 
320       /* advance past the closer and build a return token */
321       advance(); advance();
322       return new str_token(sym.CODE_STRING, result.toString());
323     }
324 
325   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
326 
327   /** Process an identifier.  Identifiers begin with a letter, underscore,
328    *  or dollar sign, which is followed by zero or more letters, numbers,
329    *  underscores or dollar signs.  This routine returns an str_token suitable
330    *  for return by the scanner.
331    */
do_id()332   protected static token do_id() throws java.io.IOException
333     {
334       StringBuffer result = new StringBuffer();
335       String       result_str;
336       Integer      keyword_num;
337       char         buffer[] = new char[1];
338 
339       /* next_char holds first character of id */
340       buffer[0] = (char)next_char;
341       result.append(buffer,0,1);
342       advance();
343 
344       /* collect up characters while they fit in id */
345       while(id_char(next_char))
346     {
347           buffer[0] = (char)next_char;
348       result.append(buffer,0,1);
349       advance();
350     }
351 
352       /* extract a string and try to look it up as a keyword */
353       result_str = result.toString();
354       keyword_num = (Integer)keywords.get(result_str);
355 
356       /* if we found something, return that keyword */
357       if (keyword_num != null)
358     return new token(keyword_num.intValue());
359 
360       /* otherwise build and return an id token with an attached string */
361       return new str_token(sym.ID, result_str);
362     }
363 
364   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
365 
366   /** Return one token.  This is the main external interface to the scanner.
367    *  It consumes sufficient characters to determine the next input token
368    *  and returns it.  To help with debugging, this routine actually calls
369    *  real_next_token() which does the work.  If you need to debug the
370    *  parser, this can be changed to call debug_next_token() which prints
371    *  a debugging message before returning the token.
372    */
next_token()373   public static token next_token() throws java.io.IOException
374     {
375       return real_next_token();
376     }
377 
378   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
379 
380   /** Debugging version of next_token().  This routine calls the real scanning
381    *  routine, prints a message on System.out indicating what the token is,
382    *  then returns it.
383    */
debug_next_token()384   public static token debug_next_token() throws java.io.IOException
385     {
386       token result = real_next_token();
387       System.out.println("# next_token() => " + result.sym);
388       return result;
389     }
390 
391   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
392 
393   /** The actual routine to return one token.  This is normally called from
394    *  next_token(), but for debugging purposes can be called indirectly from
395    *  debug_next_token().
396    */
real_next_token()397   protected static token real_next_token() throws java.io.IOException
398     {
399       int sym_num;
400 
401       for (;;)
402     {
403       /* look for white space */
404       if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
405           next_char == '\f' ||  next_char == '\r')
406         {
407           /* advance past it and try the next character */
408           advance();
409           continue;
410         }
411 
412       /* look for a single character symbol */
413       sym_num = find_single_char(next_char);
414       if (sym_num != -1)
415         {
416           /* found one -- advance past it and return a token for it */
417           advance();
418           return new token(sym_num);
419         }
420 
421       /* look for : or ::= */
422       if (next_char == ':')
423         {
424           /* if we don't have a second ':' return COLON */
425           if (next_char2 != ':')
426         {
427           advance();
428           return new token(sym.COLON);
429         }
430 
431           /* move forward and look for the '=' */
432           advance();
433           if (next_char2 == '=')
434         {
435           advance(); advance();
436           return new token(sym.COLON_COLON_EQUALS);
437         }
438           else
439         {
440           /* return just the colon (already consumed) */
441           return new token(sym.COLON);
442         }
443         }
444 
445       /* look for a comment */
446       if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
447         {
448           /* swallow then continue the scan */
449           swallow_comment();
450           continue;
451         }
452 
453       /* look for start of code string */
454       if (next_char == '{' && next_char2 == ':')
455         return do_code_string();
456 
457       /* look for an id or keyword */
458       if (id_start_char(next_char)) return do_id();
459 
460       /* look for EOF */
461       if (next_char == EOF_CHAR) return new token(sym.EOF);
462 
463       /* if we get here, we have an unrecognized character */
464       emit_warn("Unrecognized character '" +
465         new Character((char)next_char) + "'(" + next_char +
466         ") -- ignored");
467 
468       /* advance past it */
469       advance();
470     }
471     }
472 
473   /*-----------------------------------------------------------*/
474 };
475 
476