1 /*
2  * [The "BSD licence"]
3  * Copyright (c) 2005-2008 Terence Parr
4  * All rights reserved.
5  *
6  * Conversion to C#:
7  * Copyright (c) 2008-2009 Sam Harwell, Pixel Mine, Inc.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 namespace Antlr.Runtime
34 {
35     using ConditionalAttribute = System.Diagnostics.ConditionalAttribute;
36 
37     /** <summary>
     *  A lexer is a recognizer that draws input symbols from a character stream.
     *  Lexer grammars result in a subclass of this object. A Lexer object
40      *  uses simplified match() and error recovery mechanisms in the interest
41      *  of speed.
42      *  </summary>
43      */
44     public abstract class Lexer : BaseRecognizer, ITokenSource
45     {
46         /** <summary>Where is the lexer drawing characters from?</summary> */
47         protected ICharStream input;
48 
Lexer()49         public Lexer()
50         {
51         }
52 
Lexer( ICharStream input )53         public Lexer( ICharStream input )
54         {
55             this.input = input;
56         }
57 
Lexer( ICharStream input, RecognizerSharedState state )58         public Lexer( ICharStream input, RecognizerSharedState state )
59             : base(state)
60         {
61             this.input = input;
62         }
63 
64         #region Properties
65         public string Text
66         {
67             /** <summary>Return the text matched so far for the current token or any text override.</summary> */
68             get
69             {
70                 if ( state.text != null )
71                 {
72                     return state.text;
73                 }
74                 return input.Substring( state.tokenStartCharIndex, CharIndex - state.tokenStartCharIndex );
75             }
76             /** <summary>Set the complete text of this token; it wipes any previous changes to the text.</summary> */
77             set
78             {
79                 state.text = value;
80             }
81         }
82         public int Line
83         {
84             get
85             {
86                 return input.Line;
87             }
88             set
89             {
90                 input.Line = value;
91             }
92         }
93         public int CharPositionInLine
94         {
95             get
96             {
97                 return input.CharPositionInLine;
98             }
99             set
100             {
101                 input.CharPositionInLine = value;
102             }
103         }
104         #endregion
105 
Reset()106         public override void Reset()
107         {
108             base.Reset(); // reset all recognizer state variables
109             // wack Lexer state variables
110             if ( input != null )
111             {
112                 input.Seek( 0 ); // rewind the input
113             }
114             if ( state == null )
115             {
116                 return; // no shared state work to do
117             }
118             state.token = null;
119             state.type = TokenTypes.Invalid;
120             state.channel = TokenChannels.Default;
121             state.tokenStartCharIndex = -1;
122             state.tokenStartCharPositionInLine = -1;
123             state.tokenStartLine = -1;
124             state.text = null;
125         }
126 
127         /** <summary>Return a token from this source; i.e., match a token on the char stream.</summary> */
NextToken()128         public virtual IToken NextToken()
129         {
130             for ( ; ; )
131             {
132                 state.token = null;
133                 state.channel = TokenChannels.Default;
134                 state.tokenStartCharIndex = input.Index;
135                 state.tokenStartCharPositionInLine = input.CharPositionInLine;
136                 state.tokenStartLine = input.Line;
137                 state.text = null;
138                 if ( input.LA( 1 ) == CharStreamConstants.EndOfFile )
139                 {
140                     return GetEndOfFileToken();
141                 }
142                 try
143                 {
144                     ParseNextToken();
145                     if ( state.token == null )
146                     {
147                         Emit();
148                     }
149                     else if ( state.token == Tokens.Skip )
150                     {
151                         continue;
152                     }
153                     return state.token;
154                 }
155                 catch (MismatchedRangeException mre)
156                 {
157                     ReportError(mre);
158                     // MatchRange() routine has already called recover()
159                 }
160                 catch (MismatchedTokenException mte)
161                 {
162                     ReportError(mte);
163                     // Match() routine has already called recover()
164                 }
165                 catch ( RecognitionException re )
166                 {
167                     ReportError( re );
168                     Recover( re ); // throw out current char and try again
169                 }
170             }
171         }
172 
173         /** Returns the EOF token (default), if you need
174          *  to return a custom token instead override this method.
175          */
GetEndOfFileToken()176         public virtual IToken GetEndOfFileToken()
177         {
178             IToken eof = new CommonToken((ICharStream)input, CharStreamConstants.EndOfFile, TokenChannels.Default, input.Index, input.Index);
179             eof.Line = Line;
180             eof.CharPositionInLine = CharPositionInLine;
181             return eof;
182         }
183 
184         /** <summary>
185          *  Instruct the lexer to skip creating a token for current lexer rule
186          *  and look for another token.  nextToken() knows to keep looking when
187          *  a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
188          *  if token==null at end of any token rule, it creates one for you
189          *  and emits it.
190          *  </summary>
191          */
Skip()192         public virtual void Skip()
193         {
194             state.token = Tokens.Skip;
195         }
196 
197         /** <summary>This is the lexer entry point that sets instance var 'token'</summary> */
mTokens()198         public abstract void mTokens();
199 
200         public virtual ICharStream CharStream
201         {
202             get
203             {
204                 return input;
205             }
206             /** <summary>Set the char stream and reset the lexer</summary> */
207             set
208             {
209                 input = null;
210                 Reset();
211                 input = value;
212             }
213         }
214 
215         public override string SourceName
216         {
217             get
218             {
219                 return input.SourceName;
220             }
221         }
222 
223         /** <summary>
224          *  Currently does not support multiple emits per nextToken invocation
225          *  for efficiency reasons.  Subclass and override this method and
226          *  nextToken (to push tokens into a list and pull from that list rather
227          *  than a single variable as this implementation does).
228          *  </summary>
229          */
Emit( IToken token )230         public virtual void Emit( IToken token )
231         {
232             state.token = token;
233         }
234 
235         /** <summary>
236          *  The standard method called to automatically emit a token at the
237          *  outermost lexical rule.  The token object should point into the
238          *  char buffer start..stop.  If there is a text override in 'text',
239          *  use that to set the token's text.  Override this method to emit
240          *  custom Token objects.
241          *  </summary>
242          *
243          *  <remarks>
244          *  If you are building trees, then you should also override
245          *  Parser or TreeParser.getMissingSymbol().
246          *  </remarks>
247          */
Emit()248         public virtual IToken Emit()
249         {
250             IToken t = new CommonToken( input, state.type, state.channel, state.tokenStartCharIndex, CharIndex - 1 );
251             t.Line = state.tokenStartLine;
252             t.Text = state.text;
253             t.CharPositionInLine = state.tokenStartCharPositionInLine;
254             Emit( t );
255             return t;
256         }
257 
Match( string s )258         public virtual void Match( string s )
259         {
260             int i = 0;
261             while ( i < s.Length )
262             {
263                 if ( input.LA( 1 ) != s[i] )
264                 {
265                     if ( state.backtracking > 0 )
266                     {
267                         state.failed = true;
268                         return;
269                     }
270                     MismatchedTokenException mte = new MismatchedTokenException(s[i], input, TokenNames);
271                     Recover( mte );
272                     throw mte;
273                 }
274                 i++;
275                 input.Consume();
276                 state.failed = false;
277             }
278         }
279 
MatchAny()280         public virtual void MatchAny()
281         {
282             input.Consume();
283         }
284 
Match( int c )285         public virtual void Match( int c )
286         {
287             if ( input.LA( 1 ) != c )
288             {
289                 if ( state.backtracking > 0 )
290                 {
291                     state.failed = true;
292                     return;
293                 }
294                 MismatchedTokenException mte = new MismatchedTokenException(c, input, TokenNames);
295                 Recover( mte );  // don't really recover; just consume in lexer
296                 throw mte;
297             }
298             input.Consume();
299             state.failed = false;
300         }
301 
MatchRange( int a, int b )302         public virtual void MatchRange( int a, int b )
303         {
304             if ( input.LA( 1 ) < a || input.LA( 1 ) > b )
305             {
306                 if ( state.backtracking > 0 )
307                 {
308                     state.failed = true;
309                     return;
310                 }
311                 MismatchedRangeException mre = new MismatchedRangeException(a, b, input);
312                 Recover( mre );
313                 throw mre;
314             }
315             input.Consume();
316             state.failed = false;
317         }
318 
319         /** <summary>What is the index of the current character of lookahead?</summary> */
320         public virtual int CharIndex
321         {
322             get
323             {
324                 return input.Index;
325             }
326         }
327 
ReportError( RecognitionException e )328         public override void ReportError( RecognitionException e )
329         {
330             /** TODO: not thought about recovery in lexer yet.
331              *
332             // if we've already reported an error and have not matched a token
333             // yet successfully, don't report any errors.
334             if ( errorRecovery ) {
335                 //System.err.print("[SPURIOUS] ");
336                 return;
337             }
338             errorRecovery = true;
339              */
340 
341             DisplayRecognitionError( this.TokenNames, e );
342         }
343 
GetErrorMessage( RecognitionException e, string[] tokenNames )344         public override string GetErrorMessage( RecognitionException e, string[] tokenNames )
345         {
346             string msg = null;
347             if ( e is MismatchedTokenException )
348             {
349                 MismatchedTokenException mte = (MismatchedTokenException)e;
350                 msg = "mismatched character " + GetCharErrorDisplay( e.Character ) + " expecting " + GetCharErrorDisplay( mte.Expecting );
351             }
352             else if ( e is NoViableAltException )
353             {
354                 NoViableAltException nvae = (NoViableAltException)e;
355                 // for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>"
356                 // and "(decision="+nvae.decisionNumber+") and
357                 // "state "+nvae.stateNumber
358                 msg = "no viable alternative at character " + GetCharErrorDisplay( e.Character );
359             }
360             else if ( e is EarlyExitException )
361             {
362                 EarlyExitException eee = (EarlyExitException)e;
363                 // for development, can add "(decision="+eee.decisionNumber+")"
364                 msg = "required (...)+ loop did not match anything at character " + GetCharErrorDisplay( e.Character );
365             }
366             else if ( e is MismatchedNotSetException )
367             {
368                 MismatchedNotSetException mse = (MismatchedNotSetException)e;
369                 msg = "mismatched character " + GetCharErrorDisplay( e.Character ) + " expecting set " + mse.Expecting;
370             }
371             else if ( e is MismatchedSetException )
372             {
373                 MismatchedSetException mse = (MismatchedSetException)e;
374                 msg = "mismatched character " + GetCharErrorDisplay( e.Character ) + " expecting set " + mse.Expecting;
375             }
376             else if ( e is MismatchedRangeException )
377             {
378                 MismatchedRangeException mre = (MismatchedRangeException)e;
379                 msg = "mismatched character " + GetCharErrorDisplay( e.Character ) + " expecting set " +
380                       GetCharErrorDisplay( mre.A ) + ".." + GetCharErrorDisplay( mre.B );
381             }
382             else
383             {
384                 msg = base.GetErrorMessage( e, tokenNames );
385             }
386             return msg;
387         }
388 
GetCharErrorDisplay( int c )389         public virtual string GetCharErrorDisplay( int c )
390         {
391             string s = ( (char)c ).ToString();
392             switch ( c )
393             {
394             case TokenTypes.EndOfFile:
395                 s = "<EOF>";
396                 break;
397             case '\n':
398                 s = "\\n";
399                 break;
400             case '\t':
401                 s = "\\t";
402                 break;
403             case '\r':
404                 s = "\\r";
405                 break;
406             }
407             return "'" + s + "'";
408         }
409 
410         /** <summary>
411          *  Lexers can normally match any char in it's vocabulary after matching
412          *  a token, so do the easy thing and just kill a character and hope
413          *  it all works out.  You can instead use the rule invocation stack
414          *  to do sophisticated error recovery if you are in a fragment rule.
415          *  </summary>
416          */
Recover( RecognitionException re )417         public virtual void Recover( RecognitionException re )
418         {
419             //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
420             //re.printStackTrace();
421             input.Consume();
422         }
423 
424         [Conditional("ANTLR_TRACE")]
TraceIn( string ruleName, int ruleIndex )425         public virtual void TraceIn( string ruleName, int ruleIndex )
426         {
427             string inputSymbol = ( (char)input.LT( 1 ) ) + " line=" + Line + ":" + CharPositionInLine;
428             base.TraceIn( ruleName, ruleIndex, inputSymbol );
429         }
430 
431         [Conditional("ANTLR_TRACE")]
TraceOut( string ruleName, int ruleIndex )432         public virtual void TraceOut( string ruleName, int ruleIndex )
433         {
434             string inputSymbol = ( (char)input.LT( 1 ) ) + " line=" + Line + ":" + CharPositionInLine;
435             base.TraceOut( ruleName, ruleIndex, inputSymbol );
436         }
437 
ParseNextToken()438         protected virtual void ParseNextToken()
439         {
440             mTokens();
441         }
442     }
443 }
444