1/*
2 * [The "BSD license"]
3 *  Copyright (c) 2010 Terence Parr
4 *  All rights reserved.
5 *
6 *  Redistribution and use in source and binary forms, with or without
7 *  modification, are permitted provided that the following conditions
8 *  are met:
9 *  1. Redistributions of source code must retain the above copyright
10 *      notice, this list of conditions and the following disclaimer.
11 *  2. Redistributions in binary form must reproduce the above copyright
12 *      notice, this list of conditions and the following disclaimer in the
13 *      documentation and/or other materials provided with the distribution.
14 *  3. The name of the author may not be used to endorse or promote products
15 *      derived from this software without specific prior written permission.
16 *
17 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28package org.antlr.codegen;
29
30import org.antlr.Tool;
31import org.antlr.analysis.Label;
32import org.antlr.runtime.Token;
33import org.stringtemplate.v4.ST;
34import org.antlr.tool.Grammar;
35
36import java.io.IOException;
37import java.util.List;
38
39/** The code generator for ANTLR can usually be retargeted just by providing
40 *  a new X.stg file for language X, however, sometimes the files that must
41 *  be generated vary enough that some X-specific functionality is required.
42 *  For example, in C, you must generate header files whereas in Java you do not.
43 *  Other languages may want to keep DFA separate from the main
44 *  generated recognizer file.
45 *
46 *  The notion of a Code Generator target abstracts out the creation
47 *  of the various files.  As new language targets get added to the ANTLR
48 *  system, this target class may have to be altered to handle more
49 *  functionality.  Eventually, just about all language generation issues
50 *  will be expressible in terms of these methods.
51 *
 *  If an org.antlr.codegen.XTarget class exists, it is used; otherwise this
 *  Target base class is used.  I am using a superclass rather than an
54 *  interface for this target concept because I can add functionality
55 *  later without breaking previously written targets (extra interface
56 *  methods would force adding dummy functions to all code generator
57 *  target classes).
58 *
59 */
60public class Target {
61
62	/** For pure strings of Java 16-bit unicode char, how can we display
63	 *  it in the target language as a literal.  Useful for dumping
64	 *  predicates and such that may refer to chars that need to be escaped
65	 *  when represented as strings.  Also, templates need to be escaped so
66	 *  that the target language can hold them as a string.
67	 *
68	 *  I have defined (via the constructor) the set of typical escapes,
69	 *  but your Target subclass is free to alter the translated chars or
70	 *  add more definitions.  This is nonstatic so each target can have
71	 *  a different set in memory at same time.
72	 */
73	protected String[] targetCharValueEscape = new String[255];
74
75	public Target() {
76		targetCharValueEscape['\n'] = "\\n";
77		targetCharValueEscape['\r'] = "\\r";
78		targetCharValueEscape['\t'] = "\\t";
79		targetCharValueEscape['\b'] = "\\b";
80		targetCharValueEscape['\f'] = "\\f";
81		targetCharValueEscape['\\'] = "\\\\";
82		targetCharValueEscape['\''] = "\\'";
83		targetCharValueEscape['"'] = "\\\"";
84	}
85
86	protected void genRecognizerFile(Tool tool,
87									 CodeGenerator generator,
88									 Grammar grammar,
89									 ST outputFileST)
90		throws IOException
91	{
92		String fileName =
93			generator.getRecognizerFileName(grammar.name, grammar.type);
94		generator.write(outputFileST, fileName);
95	}
96
97	protected void genRecognizerHeaderFile(Tool tool,
98										   CodeGenerator generator,
99										   Grammar grammar,
100										   ST headerFileST,
101										   String extName) // e.g., ".h"
102		throws IOException
103	{
104		// no header file by default
105	}
106
107	protected void performGrammarAnalysis(CodeGenerator generator,
108										  Grammar grammar)
109	{
110		// Build NFAs from the grammar AST
111		grammar.buildNFA();
112
113		// Create the DFA predictors for each decision
114		grammar.createLookaheadDFAs();
115	}
116
117	/** Is scope in @scope::name {action} valid for this kind of grammar?
118	 *  Targets like C++ may want to allow new scopes like headerfile or
119	 *  some such.  The action names themselves are not policed at the
120	 *  moment so targets can add template actions w/o having to recompile
121	 *  ANTLR.
122	 */
123	public boolean isValidActionScope(int grammarType, String scope) {
124		switch (grammarType) {
125			case Grammar.LEXER :
126				if ( scope.equals("lexer") ) {return true;}
127				break;
128			case Grammar.PARSER :
129				if ( scope.equals("parser") ) {return true;}
130				break;
131			case Grammar.COMBINED :
132				if ( scope.equals("parser") ) {return true;}
133				if ( scope.equals("lexer") ) {return true;}
134				break;
135			case Grammar.TREE_PARSER :
136				if ( scope.equals("treeparser") ) {return true;}
137				break;
138		}
139		return false;
140	}
141
142	/** Target must be able to override the labels used for token types */
143	public String getTokenTypeAsTargetLabel(CodeGenerator generator, int ttype) {
144		String name = generator.grammar.getTokenDisplayName(ttype);
145		// If name is a literal, return the token type instead
146		if ( name.charAt(0)=='\'' ) {
147			return String.valueOf(ttype);
148		}
149		return name;
150	}
151
152	/** Convert from an ANTLR char literal found in a grammar file to
153	 *  an equivalent char literal in the target language.  For most
154	 *  languages, this means leaving 'x' as 'x'.  Actually, we need
155	 *  to escape '\u000A' so that it doesn't get converted to \n by
156	 *  the compiler.  Convert the literal to the char value and then
157	 *  to an appropriate target char literal.
158	 *
159	 *  Expect single quotes around the incoming literal.
160	 */
161	public String getTargetCharLiteralFromANTLRCharLiteral(
162		CodeGenerator generator,
163		String literal)
164	{
165		StringBuffer buf = new StringBuffer();
166		buf.append('\'');
167		int c = Grammar.getCharValueFromGrammarCharLiteral(literal);
168		if ( c<Label.MIN_CHAR_VALUE ) {
169			return "'\u0000'";
170		}
171		if ( c<targetCharValueEscape.length &&
172			 targetCharValueEscape[c]!=null )
173		{
174			buf.append(targetCharValueEscape[c]);
175		}
176		else if ( Character.UnicodeBlock.of((char)c)==
177				  Character.UnicodeBlock.BASIC_LATIN &&
178				  !Character.isISOControl((char)c) )
179		{
180			// normal char
181			buf.append((char)c);
182		}
183		else {
184			// must be something unprintable...use \\uXXXX
185			// turn on the bit above max "\\uFFFF" value so that we pad with zeros
186			// then only take last 4 digits
187			String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5);
188			buf.append("\\u");
189			buf.append(hex);
190		}
191
192		buf.append('\'');
193		return buf.toString();
194	}
195
196	/** Convert from an ANTLR string literal found in a grammar file to
197	 *  an equivalent string literal in the target language.  For Java, this
198	 *  is the translation 'a\n"' -> "a\n\"".  Expect single quotes
199	 *  around the incoming literal.  Just flip the quotes and replace
200	 *  double quotes with \"
201     *
202     *  Note that we have decided to allow poeple to use '\"' without
203     *  penalty, so we must build the target string in a loop as Utils.replae
204     *  cannot handle both \" and " without a lot of messing around.
205     *
206	 */
207	public String getTargetStringLiteralFromANTLRStringLiteral(
208		CodeGenerator generator,
209		String literal)
210	{
211        StringBuilder sb = new StringBuilder();
212        StringBuffer is = new StringBuffer(literal);
213
214        // Opening quote
215        //
216        sb.append('"');
217
218        for (int i = 1; i < is.length() -1; i++) {
219            if  (is.charAt(i) == '\\') {
220                // Anything escaped is what it is! We assume that
221                // people know how to escape characters correctly. However
222                // we catch anything that does not need an escape in Java (which
223                // is what the default implementation is dealing with and remove
224                // the escape. The C target does this for instance.
225                //
226                switch (is.charAt(i+1)) {
227                    // Pass through any escapes that Java also needs
228                    //
229                    case    '"':
230                    case    'n':
231                    case    'r':
232                    case    't':
233                    case    'b':
234                    case    'f':
235                    case    '\\':
236                    case    'u':    // Assume unnnn
237                        sb.append('\\');    // Pass the escape through
238                        break;
239                    default:
240                        // Remove the escape by virtue of not adding it here
241                        // Thus \' becomes ' and so on
242                        //
243                        break;
244                }
245
246                // Go past the \ character
247                //
248                i++;
249            } else {
250                // Chracters that don't need \ in ANTLR 'strings' but do in Java
251                //
252                if (is.charAt(i) == '"') {
253                    // We need to escape " in Java
254                    //
255                    sb.append('\\');
256                }
257            }
258            // Add in the next character, which may have been escaped
259            //
260            sb.append(is.charAt(i));
261        }
262
263        // Append closing " and return
264        //
265        sb.append('"');
266
267		return sb.toString();
268	}
269
270	/** Given a random string of Java unicode chars, return a new string with
271	 *  optionally appropriate quote characters for target language and possibly
272	 *  with some escaped characters.  For example, if the incoming string has
273	 *  actual newline characters, the output of this method would convert them
274	 *  to the two char sequence \n for Java, C, C++, ...  The new string has
275	 *  double-quotes around it as well.  Example String in memory:
276	 *
277	 *     a"[newlinechar]b'c[carriagereturnchar]d[tab]e\f
278	 *
279	 *  would be converted to the valid Java s:
280	 *
281	 *     "a\"\nb'c\rd\te\\f"
282	 *
283	 *  or
284	 *
285	 *     a\"\nb'c\rd\te\\f
286	 *
287	 *  depending on the quoted arg.
288	 */
289	public String getTargetStringLiteralFromString(String s, boolean quoted) {
290		if ( s==null ) {
291			return null;
292		}
293
294		StringBuffer buf = new StringBuffer();
295		if ( quoted ) {
296			buf.append('"');
297		}
298		for (int i=0; i<s.length(); i++) {
299			int c = s.charAt(i);
300			if ( c!='\'' && // don't escape single quotes in strings for java
301				 c<targetCharValueEscape.length &&
302				 targetCharValueEscape[c]!=null )
303			{
304				buf.append(targetCharValueEscape[c]);
305			}
306			else {
307				buf.append((char)c);
308			}
309		}
310		if ( quoted ) {
311			buf.append('"');
312		}
313		return buf.toString();
314	}
315
316	public String getTargetStringLiteralFromString(String s) {
317		return getTargetStringLiteralFromString(s, false);
318	}
319
320	/** Convert long to 0xNNNNNNNNNNNNNNNN by default for spitting out
321	 *  with bitsets.  I.e., convert bytes to hex string.
322	 */
323	public String getTarget64BitStringFromValue(long word) {
324		int numHexDigits = 8*2;
325		StringBuffer buf = new StringBuffer(numHexDigits+2);
326		buf.append("0x");
327		String digits = Long.toHexString(word);
328		digits = digits.toUpperCase();
329		int padding = numHexDigits - digits.length();
330		// pad left with zeros
331		for (int i=1; i<=padding; i++) {
332			buf.append('0');
333		}
334		buf.append(digits);
335		return buf.toString();
336	}
337
338	public String encodeIntAsCharEscape(int v) {
339		if ( v<=127 ) {
340			return "\\"+Integer.toOctalString(v);
341		}
342		String hex = Integer.toHexString(v|0x10000).substring(1,5);
343		return "\\u"+hex;
344	}
345
346	/** Some targets only support ASCII or 8-bit chars/strings.  For example,
347	 *  C++ will probably want to return 0xFF here.
348	 */
349	public int getMaxCharValue(CodeGenerator generator) {
350		return Label.MAX_CHAR_VALUE;
351	}
352
353	/** Give target a chance to do some postprocessing on actions.
354	 *  Python for example will have to fix the indention.
355	 */
356	public List postProcessAction(List chunks, Token actionToken) {
357		return chunks;
358	}
359
360}
361