1 /* 2 * [The "BSD license"] 3 * Copyright (c) 2010 Terence Parr 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. The name of the author may not be used to endorse or promote products 15 * derived from this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 package org.antlr.codegen; 29 30 import org.antlr.Tool; 31 import org.antlr.analysis.Label; 32 import org.antlr.runtime.Token; 33 import org.stringtemplate.v4.ST; 34 import org.antlr.tool.Grammar; 35 36 import java.io.IOException; 37 import java.util.List; 38 39 /** The code generator for ANTLR can usually be retargeted just by providing 40 * a new X.stg file for language X, however, sometimes the files that must 41 * be generated vary enough that some X-specific functionality is required. 42 * For example, in C, you must generate header files whereas in Java you do not. 43 * Other languages may want to keep DFA separate from the main 44 * generated recognizer file. 45 * 46 * The notion of a Code Generator target abstracts out the creation 47 * of the various files. As new language targets get added to the ANTLR 48 * system, this target class may have to be altered to handle more 49 * functionality. Eventually, just about all language generation issues 50 * will be expressible in terms of these methods. 51 * 52 * If org.antlr.codegen.XTarget class exists, it is used else 53 * Target base class is used. I am using a superclass rather than an 54 * interface for this target concept because I can add functionality 55 * later without breaking previously written targets (extra interface 56 * methods would force adding dummy functions to all code generator 57 * target classes). 58 * 59 */ 60 public class Target { 61 62 /** For pure strings of Java 16-bit unicode char, how can we display 63 * it in the target language as a literal. Useful for dumping 64 * predicates and such that may refer to chars that need to be escaped 65 * when represented as strings. Also, templates need to be escaped so 66 * that the target language can hold them as a string. 67 * 68 * I have defined (via the constructor) the set of typical escapes, 69 * but your Target subclass is free to alter the translated chars or 70 * add more definitions. This is nonstatic so each target can have 71 * a different set in memory at same time. 72 */ 73 protected String[] targetCharValueEscape = new String[255]; 74 Target()75 public Target() { 76 targetCharValueEscape['\n'] = "\\n"; 77 targetCharValueEscape['\r'] = "\\r"; 78 targetCharValueEscape['\t'] = "\\t"; 79 targetCharValueEscape['\b'] = "\\b"; 80 targetCharValueEscape['\f'] = "\\f"; 81 targetCharValueEscape['\\'] = "\\\\"; 82 targetCharValueEscape['\''] = "\\'"; 83 targetCharValueEscape['"'] = "\\\""; 84 } 85 useBaseTemplatesForSynPredFragments()86 public boolean useBaseTemplatesForSynPredFragments() { 87 return true; 88 } 89 genRecognizerFile(Tool tool, CodeGenerator generator, Grammar grammar, ST outputFileST)90 protected void genRecognizerFile(Tool tool, 91 CodeGenerator generator, 92 Grammar grammar, 93 ST outputFileST) 94 throws IOException 95 { 96 String fileName = 97 generator.getRecognizerFileName(grammar.name, grammar.type); 98 generator.write(outputFileST, fileName); 99 } 100 genRecognizerHeaderFile(Tool tool, CodeGenerator generator, Grammar grammar, ST headerFileST, String extName)101 protected void genRecognizerHeaderFile(Tool tool, 102 CodeGenerator generator, 103 Grammar grammar, 104 ST headerFileST, 105 String extName) // e.g., ".h" 106 throws IOException 107 { 108 // no header file by default 109 } 110 performGrammarAnalysis(CodeGenerator generator, Grammar grammar)111 protected void performGrammarAnalysis(CodeGenerator generator, 112 Grammar grammar) 113 { 114 // Build NFAs from the grammar AST 115 grammar.buildNFA(); 116 117 // Create the DFA predictors for each decision 118 grammar.createLookaheadDFAs(); 119 } 120 121 /** Is scope in @scope::name {action} valid for this kind of grammar? 122 * Targets like C++ may want to allow new scopes like headerfile or 123 * some such. The action names themselves are not policed at the 124 * moment so targets can add template actions w/o having to recompile 125 * ANTLR. 126 */ isValidActionScope(int grammarType, String scope)127 public boolean isValidActionScope(int grammarType, String scope) { 128 switch (grammarType) { 129 case Grammar.LEXER : 130 if ( scope.equals("lexer") ) {return true;} 131 break; 132 case Grammar.PARSER : 133 if ( scope.equals("parser") ) {return true;} 134 break; 135 case Grammar.COMBINED : 136 if ( scope.equals("parser") ) {return true;} 137 if ( scope.equals("lexer") ) {return true;} 138 break; 139 case Grammar.TREE_PARSER : 140 if ( scope.equals("treeparser") ) {return true;} 141 break; 142 } 143 return false; 144 } 145 146 /** Target must be able to override the labels used for token types */ getTokenTypeAsTargetLabel(CodeGenerator generator, int ttype)147 public String getTokenTypeAsTargetLabel(CodeGenerator generator, int ttype) { 148 String name = generator.grammar.getTokenDisplayName(ttype); 149 // If name is a literal, return the token type instead 150 if ( name.charAt(0)=='\'' ) { 151 return String.valueOf(ttype); 152 } 153 return name; 154 } 155 156 /** Convert from an ANTLR char literal found in a grammar file to 157 * an equivalent char literal in the target language. For most 158 * languages, this means leaving 'x' as 'x'. Actually, we need 159 * to escape '\u000A' so that it doesn't get converted to \n by 160 * the compiler. Convert the literal to the char value and then 161 * to an appropriate target char literal. 162 * 163 * Expect single quotes around the incoming literal. 164 */ getTargetCharLiteralFromANTLRCharLiteral( CodeGenerator generator, String literal)165 public String getTargetCharLiteralFromANTLRCharLiteral( 166 CodeGenerator generator, 167 String literal) 168 { 169 StringBuilder buf = new StringBuilder(); 170 buf.append('\''); 171 int c = Grammar.getCharValueFromGrammarCharLiteral(literal); 172 if ( c<Label.MIN_CHAR_VALUE ) { 173 return "'\u0000'"; 174 } 175 if ( c<targetCharValueEscape.length && 176 targetCharValueEscape[c]!=null ) 177 { 178 buf.append(targetCharValueEscape[c]); 179 } 180 else if ( Character.UnicodeBlock.of((char)c)== 181 Character.UnicodeBlock.BASIC_LATIN && 182 !Character.isISOControl((char)c) ) 183 { 184 // normal char 185 buf.append((char)c); 186 } 187 else { 188 // must be something unprintable...use \\uXXXX 189 // turn on the bit above max "\\uFFFF" value so that we pad with zeros 190 // then only take last 4 digits 191 String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5); 192 buf.append("\\u"); 193 buf.append(hex); 194 } 195 196 buf.append('\''); 197 return buf.toString(); 198 } 199 200 /** Convert from an ANTLR string literal found in a grammar file to 201 * an equivalent string literal in the target language. For Java, this 202 * is the translation 'a\n"' → "a\n\"". Expect single quotes 203 * around the incoming literal. Just flip the quotes and replace 204 * double quotes with \" 205 * 206 * Note that we have decided to allow poeple to use '\"' without 207 * penalty, so we must build the target string in a loop as Utils.replae 208 * cannot handle both \" and " without a lot of messing around. 209 * 210 */ getTargetStringLiteralFromANTLRStringLiteral( CodeGenerator generator, String literal)211 public String getTargetStringLiteralFromANTLRStringLiteral( 212 CodeGenerator generator, 213 String literal) 214 { 215 StringBuilder sb = new StringBuilder(); 216 StringBuilder is = new StringBuilder(literal); 217 218 // Opening quote 219 // 220 sb.append('"'); 221 222 for (int i = 1; i < is.length() -1; i++) { 223 if (is.charAt(i) == '\\') { 224 // Anything escaped is what it is! We assume that 225 // people know how to escape characters correctly. However 226 // we catch anything that does not need an escape in Java (which 227 // is what the default implementation is dealing with and remove 228 // the escape. The C target does this for instance. 229 // 230 switch (is.charAt(i+1)) { 231 // Pass through any escapes that Java also needs 232 // 233 case '"': 234 case 'n': 235 case 'r': 236 case 't': 237 case 'b': 238 case 'f': 239 case '\\': 240 case 'u': // Assume unnnn 241 sb.append('\\'); // Pass the escape through 242 break; 243 default: 244 // Remove the escape by virtue of not adding it here 245 // Thus \' becomes ' and so on 246 // 247 break; 248 } 249 250 // Go past the \ character 251 // 252 i++; 253 } else { 254 // Chracters that don't need \ in ANTLR 'strings' but do in Java 255 // 256 if (is.charAt(i) == '"') { 257 // We need to escape " in Java 258 // 259 sb.append('\\'); 260 } 261 } 262 // Add in the next character, which may have been escaped 263 // 264 sb.append(is.charAt(i)); 265 } 266 267 // Append closing " and return 268 // 269 sb.append('"'); 270 271 return sb.toString(); 272 } 273 274 /** Given a random string of Java unicode chars, return a new string with 275 * optionally appropriate quote characters for target language and possibly 276 * with some escaped characters. For example, if the incoming string has 277 * actual newline characters, the output of this method would convert them 278 * to the two char sequence \n for Java, C, C++, ... The new string has 279 * double-quotes around it as well. Example String in memory: 280 * 281 * a"[newlinechar]b'c[carriagereturnchar]d[tab]e\f 282 * 283 * would be converted to the valid Java s: 284 * 285 * "a\"\nb'c\rd\te\\f" 286 * 287 * or 288 * 289 * a\"\nb'c\rd\te\\f 290 * 291 * depending on the quoted arg. 292 */ getTargetStringLiteralFromString(String s, boolean quoted)293 public String getTargetStringLiteralFromString(String s, boolean quoted) { 294 if ( s==null ) { 295 return null; 296 } 297 298 StringBuilder buf = new StringBuilder(); 299 if ( quoted ) { 300 buf.append('"'); 301 } 302 for (int i=0; i<s.length(); i++) { 303 int c = s.charAt(i); 304 if ( c!='\'' && // don't escape single quotes in strings for java 305 c<targetCharValueEscape.length && 306 targetCharValueEscape[c]!=null ) 307 { 308 buf.append(targetCharValueEscape[c]); 309 } 310 else { 311 buf.append((char)c); 312 } 313 } 314 if ( quoted ) { 315 buf.append('"'); 316 } 317 return buf.toString(); 318 } 319 getTargetStringLiteralFromString(String s)320 public String getTargetStringLiteralFromString(String s) { 321 return getTargetStringLiteralFromString(s, false); 322 } 323 324 /** Convert long to 0xNNNNNNNNNNNNNNNN by default for spitting out 325 * with bitsets. I.e., convert bytes to hex string. 326 */ getTarget64BitStringFromValue(long word)327 public String getTarget64BitStringFromValue(long word) { 328 int numHexDigits = 8*2; 329 StringBuilder buf = new StringBuilder(numHexDigits+2); 330 buf.append("0x"); 331 String digits = Long.toHexString(word); 332 digits = digits.toUpperCase(); 333 int padding = numHexDigits - digits.length(); 334 // pad left with zeros 335 for (int i=1; i<=padding; i++) { 336 buf.append('0'); 337 } 338 buf.append(digits); 339 return buf.toString(); 340 } 341 encodeIntAsCharEscape(int v)342 public String encodeIntAsCharEscape(int v) { 343 if ( v<=127 ) { 344 return "\\"+Integer.toOctalString(v); 345 } 346 String hex = Integer.toHexString(v|0x10000).substring(1,5); 347 return "\\u"+hex; 348 } 349 350 /** Some targets only support ASCII or 8-bit chars/strings. For example, 351 * C++ will probably want to return 0xFF here. 352 */ getMaxCharValue(CodeGenerator generator)353 public int getMaxCharValue(CodeGenerator generator) { 354 return Label.MAX_CHAR_VALUE; 355 } 356 357 /** Give target a chance to do some postprocessing on actions. 358 * Python for example will have to fix the indention. 359 */ postProcessAction(List<Object> chunks, Token actionToken)360 public List<Object> postProcessAction(List<Object> chunks, Token actionToken) { 361 return chunks; 362 } 363 364 } 365