1 /*
2  * [The "BSD license"]
3  *  Copyright (c) 2010 Terence Parr
4  *  All rights reserved.
5  *
6  *  Redistribution and use in source and binary forms, with or without
7  *  modification, are permitted provided that the following conditions
8  *  are met:
9  *  1. Redistributions of source code must retain the above copyright
10  *      notice, this list of conditions and the following disclaimer.
11  *  2. Redistributions in binary form must reproduce the above copyright
12  *      notice, this list of conditions and the following disclaimer in the
13  *      documentation and/or other materials provided with the distribution.
14  *  3. The name of the author may not be used to endorse or promote products
15  *      derived from this software without specific prior written permission.
16  *
17  *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 package org.antlr.codegen;
29 
30 import org.antlr.Tool;
31 import org.antlr.analysis.Label;
32 import org.antlr.runtime.Token;
33 import org.stringtemplate.v4.ST;
34 import org.antlr.tool.Grammar;
35 
36 import java.io.IOException;
37 import java.util.List;
38 
39 /** The code generator for ANTLR can usually be retargeted just by providing
40  *  a new X.stg file for language X, however, sometimes the files that must
41  *  be generated vary enough that some X-specific functionality is required.
42  *  For example, in C, you must generate header files whereas in Java you do not.
43  *  Other languages may want to keep DFA separate from the main
44  *  generated recognizer file.
45  *
46  *  The notion of a Code Generator target abstracts out the creation
47  *  of the various files.  As new language targets get added to the ANTLR
48  *  system, this target class may have to be altered to handle more
49  *  functionality.  Eventually, just about all language generation issues
50  *  will be expressible in terms of these methods.
51  *
52  *  If org.antlr.codegen.XTarget class exists, it is used else
53  *  Target base class is used.  I am using a superclass rather than an
54  *  interface for this target concept because I can add functionality
55  *  later without breaking previously written targets (extra interface
56  *  methods would force adding dummy functions to all code generator
57  *  target classes).
58  *
59  */
60 public class Target {
61 
62 	/** For pure strings of Java 16-bit unicode char, how can we display
63 	 *  it in the target language as a literal.  Useful for dumping
64 	 *  predicates and such that may refer to chars that need to be escaped
65 	 *  when represented as strings.  Also, templates need to be escaped so
66 	 *  that the target language can hold them as a string.
67 	 *
68 	 *  I have defined (via the constructor) the set of typical escapes,
69 	 *  but your Target subclass is free to alter the translated chars or
70 	 *  add more definitions.  This is nonstatic so each target can have
71 	 *  a different set in memory at same time.
72 	 */
73 	protected String[] targetCharValueEscape = new String[255];
74 
Target()75 	public Target() {
76 		targetCharValueEscape['\n'] = "\\n";
77 		targetCharValueEscape['\r'] = "\\r";
78 		targetCharValueEscape['\t'] = "\\t";
79 		targetCharValueEscape['\b'] = "\\b";
80 		targetCharValueEscape['\f'] = "\\f";
81 		targetCharValueEscape['\\'] = "\\\\";
82 		targetCharValueEscape['\''] = "\\'";
83 		targetCharValueEscape['"'] = "\\\"";
84 	}
85 
useBaseTemplatesForSynPredFragments()86     public boolean useBaseTemplatesForSynPredFragments() {
87         return true;
88     }
89 
genRecognizerFile(Tool tool, CodeGenerator generator, Grammar grammar, ST outputFileST)90 	protected void genRecognizerFile(Tool tool,
91 									 CodeGenerator generator,
92 									 Grammar grammar,
93 									 ST outputFileST)
94 		throws IOException
95 	{
96 		String fileName =
97 			generator.getRecognizerFileName(grammar.name, grammar.type);
98 		generator.write(outputFileST, fileName);
99 	}
100 
genRecognizerHeaderFile(Tool tool, CodeGenerator generator, Grammar grammar, ST headerFileST, String extName)101 	protected void genRecognizerHeaderFile(Tool tool,
102 										   CodeGenerator generator,
103 										   Grammar grammar,
104 										   ST headerFileST,
105 										   String extName) // e.g., ".h"
106 		throws IOException
107 	{
108 		// no header file by default
109 	}
110 
performGrammarAnalysis(CodeGenerator generator, Grammar grammar)111 	protected void performGrammarAnalysis(CodeGenerator generator,
112 										  Grammar grammar)
113 	{
114 		// Build NFAs from the grammar AST
115 		grammar.buildNFA();
116 
117 		// Create the DFA predictors for each decision
118 		grammar.createLookaheadDFAs();
119 	}
120 
121 	/** Is scope in @scope::name {action} valid for this kind of grammar?
122 	 *  Targets like C++ may want to allow new scopes like headerfile or
123 	 *  some such.  The action names themselves are not policed at the
124 	 *  moment so targets can add template actions w/o having to recompile
125 	 *  ANTLR.
126 	 */
isValidActionScope(int grammarType, String scope)127 	public boolean isValidActionScope(int grammarType, String scope) {
128 		switch (grammarType) {
129 			case Grammar.LEXER :
130 				if ( scope.equals("lexer") ) {return true;}
131 				break;
132 			case Grammar.PARSER :
133 				if ( scope.equals("parser") ) {return true;}
134 				break;
135 			case Grammar.COMBINED :
136 				if ( scope.equals("parser") ) {return true;}
137 				if ( scope.equals("lexer") ) {return true;}
138 				break;
139 			case Grammar.TREE_PARSER :
140 				if ( scope.equals("treeparser") ) {return true;}
141 				break;
142 		}
143 		return false;
144 	}
145 
146 	/** Target must be able to override the labels used for token types */
getTokenTypeAsTargetLabel(CodeGenerator generator, int ttype)147 	public String getTokenTypeAsTargetLabel(CodeGenerator generator, int ttype) {
148 		String name = generator.grammar.getTokenDisplayName(ttype);
149 		// If name is a literal, return the token type instead
150 		if ( name.charAt(0)=='\'' ) {
151 			return String.valueOf(ttype);
152 		}
153 		return name;
154 	}
155 
156 	/** Convert from an ANTLR char literal found in a grammar file to
157 	 *  an equivalent char literal in the target language.  For most
158 	 *  languages, this means leaving 'x' as 'x'.  Actually, we need
159 	 *  to escape '\u000A' so that it doesn't get converted to \n by
160 	 *  the compiler.  Convert the literal to the char value and then
161 	 *  to an appropriate target char literal.
162 	 *
163 	 *  Expect single quotes around the incoming literal.
164 	 */
getTargetCharLiteralFromANTLRCharLiteral( CodeGenerator generator, String literal)165 	public String getTargetCharLiteralFromANTLRCharLiteral(
166 		CodeGenerator generator,
167 		String literal)
168 	{
169 		StringBuilder buf = new StringBuilder();
170 		buf.append('\'');
171 		int c = Grammar.getCharValueFromGrammarCharLiteral(literal);
172 		if ( c<Label.MIN_CHAR_VALUE ) {
173 			return "'\u0000'";
174 		}
175 		if ( c<targetCharValueEscape.length &&
176 			 targetCharValueEscape[c]!=null )
177 		{
178 			buf.append(targetCharValueEscape[c]);
179 		}
180 		else if ( Character.UnicodeBlock.of((char)c)==
181 				  Character.UnicodeBlock.BASIC_LATIN &&
182 				  !Character.isISOControl((char)c) )
183 		{
184 			// normal char
185 			buf.append((char)c);
186 		}
187 		else {
188 			// must be something unprintable...use \\uXXXX
189 			// turn on the bit above max "\\uFFFF" value so that we pad with zeros
190 			// then only take last 4 digits
191 			String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5);
192 			buf.append("\\u");
193 			buf.append(hex);
194 		}
195 
196 		buf.append('\'');
197 		return buf.toString();
198 	}
199 
200 	/** Convert from an ANTLR string literal found in a grammar file to
201 	 *  an equivalent string literal in the target language.  For Java, this
202 	 *  is the translation 'a\n"' &rarr; "a\n\"".  Expect single quotes
203 	 *  around the incoming literal.  Just flip the quotes and replace
204 	 *  double quotes with \"
205      *
206      *  Note that we have decided to allow poeple to use '\"' without
207      *  penalty, so we must build the target string in a loop as Utils.replae
208      *  cannot handle both \" and " without a lot of messing around.
209      *
210 	 */
getTargetStringLiteralFromANTLRStringLiteral( CodeGenerator generator, String literal)211 	public String getTargetStringLiteralFromANTLRStringLiteral(
212 		CodeGenerator generator,
213 		String literal)
214 	{
215         StringBuilder sb = new StringBuilder();
216         StringBuilder is = new StringBuilder(literal);
217 
218         // Opening quote
219         //
220         sb.append('"');
221 
222         for (int i = 1; i < is.length() -1; i++) {
223             if  (is.charAt(i) == '\\') {
224                 // Anything escaped is what it is! We assume that
225                 // people know how to escape characters correctly. However
226                 // we catch anything that does not need an escape in Java (which
227                 // is what the default implementation is dealing with and remove
228                 // the escape. The C target does this for instance.
229                 //
230                 switch (is.charAt(i+1)) {
231                     // Pass through any escapes that Java also needs
232                     //
233                     case    '"':
234                     case    'n':
235                     case    'r':
236                     case    't':
237                     case    'b':
238                     case    'f':
239                     case    '\\':
240                     case    'u':    // Assume unnnn
241                         sb.append('\\');    // Pass the escape through
242                         break;
243                     default:
244                         // Remove the escape by virtue of not adding it here
245                         // Thus \' becomes ' and so on
246                         //
247                         break;
248                 }
249 
250                 // Go past the \ character
251                 //
252                 i++;
253             } else {
254                 // Chracters that don't need \ in ANTLR 'strings' but do in Java
255                 //
256                 if (is.charAt(i) == '"') {
257                     // We need to escape " in Java
258                     //
259                     sb.append('\\');
260                 }
261             }
262             // Add in the next character, which may have been escaped
263             //
264             sb.append(is.charAt(i));
265         }
266 
267         // Append closing " and return
268         //
269         sb.append('"');
270 
271 		return sb.toString();
272 	}
273 
274 	/** Given a random string of Java unicode chars, return a new string with
275 	 *  optionally appropriate quote characters for target language and possibly
276 	 *  with some escaped characters.  For example, if the incoming string has
277 	 *  actual newline characters, the output of this method would convert them
278 	 *  to the two char sequence \n for Java, C, C++, ...  The new string has
279 	 *  double-quotes around it as well.  Example String in memory:
280 	 *
281 	 *     a"[newlinechar]b'c[carriagereturnchar]d[tab]e\f
282 	 *
283 	 *  would be converted to the valid Java s:
284 	 *
285 	 *     "a\"\nb'c\rd\te\\f"
286 	 *
287 	 *  or
288 	 *
289 	 *     a\"\nb'c\rd\te\\f
290 	 *
291 	 *  depending on the quoted arg.
292 	 */
getTargetStringLiteralFromString(String s, boolean quoted)293 	public String getTargetStringLiteralFromString(String s, boolean quoted) {
294 		if ( s==null ) {
295 			return null;
296 		}
297 
298 		StringBuilder buf = new StringBuilder();
299 		if ( quoted ) {
300 			buf.append('"');
301 		}
302 		for (int i=0; i<s.length(); i++) {
303 			int c = s.charAt(i);
304 			if ( c!='\'' && // don't escape single quotes in strings for java
305 				 c<targetCharValueEscape.length &&
306 				 targetCharValueEscape[c]!=null )
307 			{
308 				buf.append(targetCharValueEscape[c]);
309 			}
310 			else {
311 				buf.append((char)c);
312 			}
313 		}
314 		if ( quoted ) {
315 			buf.append('"');
316 		}
317 		return buf.toString();
318 	}
319 
getTargetStringLiteralFromString(String s)320 	public String getTargetStringLiteralFromString(String s) {
321 		return getTargetStringLiteralFromString(s, false);
322 	}
323 
324 	/** Convert long to 0xNNNNNNNNNNNNNNNN by default for spitting out
325 	 *  with bitsets.  I.e., convert bytes to hex string.
326 	 */
getTarget64BitStringFromValue(long word)327 	public String getTarget64BitStringFromValue(long word) {
328 		int numHexDigits = 8*2;
329 		StringBuilder buf = new StringBuilder(numHexDigits+2);
330 		buf.append("0x");
331 		String digits = Long.toHexString(word);
332 		digits = digits.toUpperCase();
333 		int padding = numHexDigits - digits.length();
334 		// pad left with zeros
335 		for (int i=1; i<=padding; i++) {
336 			buf.append('0');
337 		}
338 		buf.append(digits);
339 		return buf.toString();
340 	}
341 
encodeIntAsCharEscape(int v)342 	public String encodeIntAsCharEscape(int v) {
343 		if ( v<=127 ) {
344 			return "\\"+Integer.toOctalString(v);
345 		}
346 		String hex = Integer.toHexString(v|0x10000).substring(1,5);
347 		return "\\u"+hex;
348 	}
349 
350 	/** Some targets only support ASCII or 8-bit chars/strings.  For example,
351 	 *  C++ will probably want to return 0xFF here.
352 	 */
getMaxCharValue(CodeGenerator generator)353 	public int getMaxCharValue(CodeGenerator generator) {
354 		return Label.MAX_CHAR_VALUE;
355 	}
356 
357 	/** Give target a chance to do some postprocessing on actions.
358 	 *  Python for example will have to fix the indention.
359 	 */
postProcessAction(List<Object> chunks, Token actionToken)360 	public List<Object> postProcessAction(List<Object> chunks, Token actionToken) {
361 		return chunks;
362 	}
363 
364 }
365