antlr/codegen/RubyTarget.java

/*
 [The "BSD license"]
 Copyright (c) 2010 Kyle Yetter
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the author may not be used to endorse or promote products
    derived from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.antlr.codegen;

import org.antlr.Tool;
import org.antlr.tool.Grammar;
import org.stringtemplate.v4.AttributeRenderer;
import org.stringtemplate.v4.ST;
import org.stringtemplate.v4.STGroup;

import java.io.IOException;
import java.util.*;

public class RubyTarget extends Target
{
    /** A set of ruby keywords which are used to escape labels and method names
     *  which will cause parse errors in the ruby source
     */
    public static final Set rubyKeywords =
    new HashSet() {
        {
        	add( "alias" );     add( "END" );     add( "retry" );
        	add( "and" );       add( "ensure" );  add( "return" );
        	add( "BEGIN" );     add( "false" );   add( "self" );
        	add( "begin" );     add( "for" );     add( "super" );
        	add( "break" );     add( "if" );      add( "then" );
        	add( "case" );      add( "in" );      add( "true" );
        	add( "class" );     add( "module" );  add( "undef" );
        	add( "def" );       add( "next" );    add( "unless" );
        	add( "defined?" );  add( "nil" );     add( "until" );
        	add( "do" );        add( "not" );     add( "when" );
        	add( "else" );      add( "or" );      add( "while" );
        	add( "elsif" );     add( "redo" );    add( "yield" );
        	add( "end" );       add( "rescue" );
        }
    };

    public static Map<String, Map<String, Object>> sharedActionBlocks = new HashMap<String, Map<String, Object>>();

    public class RubyRenderer implements AttributeRenderer
    {
    	protected String[] rubyCharValueEscape = new String[256];

    	public RubyRenderer() {
    		for ( int i = 0; i < 16; i++ ) {
    			rubyCharValueEscape[ i ] = "\\x0" + Integer.toHexString( i );
    		}
    		for ( int i = 16; i < 32; i++ ) {
    			rubyCharValueEscape[ i ] = "\\x" + Integer.toHexString( i );
    		}
    		for ( char i = 32; i < 127; i++ ) {
    			rubyCharValueEscape[ i ] = Character.toString( i );
    		}
    		for ( int i = 127; i < 256; i++ ) {
    			rubyCharValueEscape[ i ] = "\\x" + Integer.toHexString( i );
    		}

    		rubyCharValueEscape['\n'] = "\\n";
    		rubyCharValueEscape['\r'] = "\\r";
    		rubyCharValueEscape['\t'] = "\\t";
    		rubyCharValueEscape['\b'] = "\\b";
    		rubyCharValueEscape['\f'] = "\\f";
    		rubyCharValueEscape['\\'] = "\\\\";
    		rubyCharValueEscape['"'] = "\\\"";
    	}

        public String toString( Object o, String formatName, Locale locale ) {
			if ( formatName==null ) {
				return o.toString();
			}

            String idString = o.toString();

            if ( idString.isEmpty() ) return idString;

            if ( formatName.equals( "snakecase" ) ) {
                return snakecase( idString );
            } else if ( formatName.equals( "camelcase" ) ) {
                return camelcase( idString );
            } else if ( formatName.equals( "subcamelcase" ) ) {
                return subcamelcase( idString );
            } else if ( formatName.equals( "constant" ) ) {
                return constantcase( idString );
            } else if ( formatName.equals( "platform" ) ) {
                return platform( idString );
            } else if ( formatName.equals( "lexerRule" ) ) {
                return lexerRule( idString );
            } else if ( formatName.equals( "constantPath" ) ) {
            	return constantPath( idString );
            } else if ( formatName.equals( "rubyString" ) ) {
                return rubyString( idString );
            } else if ( formatName.equals( "label" ) ) {
                return label( idString );
            } else if ( formatName.equals( "symbol" ) ) {
                return symbol( idString );
            } else {
                throw new IllegalArgumentException( "Unsupported format name" );
            }
        }

        /** given an input string, which is presumed
         * to contain a word, which may potentially be camelcased,
         * and convert it to snake_case underscore style.
         *
         * algorithm --
         *   iterate through the string with a sliding window 3 chars wide
         *
         * example -- aGUIWhatNot
         *   c   c+1 c+2  action
         *   a   G        << 'a' << '_'  // a lower-upper word edge
         *   G   U   I    << 'g'
         *   U   I   W    << 'w'
         *   I   W   h    << 'i' << '_'  // the last character in an acronym run of uppers
         *   W   h        << 'w'
         *   ... and so on
         */
        private String snakecase( String value ) {
            StringBuilder output_buffer = new StringBuilder();
            int l = value.length();
            int cliff = l - 1;
            char cur;
            char next;
            char peek;

            if ( value.isEmpty() ) return value;
            if ( l == 1 ) return value.toLowerCase();

            for ( int i = 0; i < cliff; i++ ) {
                cur  = value.charAt( i );
                next = value.charAt( i + 1 );

                if ( Character.isLetter( cur ) ) {
                    output_buffer.append( Character.toLowerCase( cur ) );

                    if ( Character.isDigit( next ) || Character.isWhitespace( next ) ) {
                        output_buffer.append( '_' );
                    } else if ( Character.isLowerCase( cur ) && Character.isUpperCase( next ) ) {
                        // at camelcase word edge
                        output_buffer.append( '_' );
                    } else if ( ( i < cliff - 1 ) && Character.isUpperCase( cur ) && Character.isUpperCase( next ) ) {
                        // cur is part of an acronym

                        peek = value.charAt( i + 2 );
                        if ( Character.isLowerCase( peek ) ) {
                            /* if next is the start of word (indicated when peek is lowercase)
                                         then the acronym must be completed by appending an underscore */
                            output_buffer.append( '_' );
                        }
                    }
                } else if ( Character.isDigit( cur ) ) {
                    output_buffer.append( cur );
                    if ( Character.isLetter( next ) ) {
                        output_buffer.append( '_' );
                    }
                } else if ( Character.isWhitespace( cur ) ) {
                    // do nothing
                } else {
                    output_buffer.append( cur );
                }

            }

            cur  = value.charAt( cliff );
            if ( ! Character.isWhitespace( cur ) ) {
                output_buffer.append( Character.toLowerCase( cur ) );
            }

            return output_buffer.toString();
        }

        private String constantcase( String value ) {
            return snakecase( value ).toUpperCase();
        }

        private String platform( String value ) {
            return ( "__" + value + "__" );
        }

        private String symbol( String value ) {
            if ( value.matches( "[a-zA-Z_]\\w*[\\?\\!\\=]?" ) ) {
                return ( ":" + value );
            } else {
                return ( "%s(" + value + ")" );
            }
        }

        private String lexerRule( String value ) {
					  // System.out.print( "lexerRule( \"" + value + "\") => " );
            if ( value.equals( "Tokens" ) ) {
							  // System.out.println( "\"token!\"" );
                return "token!";
            } else {
							  // String result = snakecase( value ) + "!";
								// System.out.println( "\"" + result + "\"" );
                return ( snakecase( value ) + "!" );
            }
        }

        private String constantPath( String value ) {
            return value.replaceAll( "\\.", "::" );
        }

        private String rubyString( String value ) {
        	StringBuilder output_buffer = new StringBuilder();
        	int len = value.length();

        	output_buffer.append( '"' );
        	for ( int i = 0; i < len; i++ ) {
        		output_buffer.append( rubyCharValueEscape[ value.charAt( i ) ] );
        	}
        	output_buffer.append( '"' );
        	return output_buffer.toString();
        }

        private String camelcase( String value ) {
            StringBuilder output_buffer = new StringBuilder();
            int cliff = value.length();
            char cur;
            char next;
            boolean at_edge = true;

            if ( value.isEmpty() ) return value;
            if ( cliff == 1 ) return value.toUpperCase();

            for ( int i = 0; i < cliff; i++ ) {
                cur  = value.charAt( i );

                if ( Character.isWhitespace( cur ) ) {
                    at_edge = true;
                    continue;
                } else if ( cur == '_' ) {
                    at_edge = true;
                    continue;
                } else if ( Character.isDigit( cur ) ) {
                    output_buffer.append( cur );
                    at_edge = true;
                    continue;
                }

                if ( at_edge ) {
                    output_buffer.append( Character.toUpperCase( cur ) );
                    if ( Character.isLetter( cur ) ) at_edge = false;
                } else {
                    output_buffer.append( cur );
                }
            }

            return output_buffer.toString();
        }

        private String label( String value ) {
            if ( rubyKeywords.contains( value ) ) {
                return platform( value );
            } else if ( Character.isUpperCase( value.charAt( 0 ) ) &&
                        ( !value.equals( "FILE" ) ) &&
                        ( !value.equals( "LINE" ) ) ) {
                return platform( value );
            } else if ( value.equals( "FILE" ) ) {
                return "_FILE_";
            } else if ( value.equals( "LINE" ) ) {
                return "_LINE_";
            } else {
                return value;
            }
        }

        private String subcamelcase( String value ) {
            value = camelcase( value );
            if ( value.isEmpty() )
                return value;
            Character head = Character.toLowerCase( value.charAt( 0 ) );
            String tail = value.substring( 1 );
            return head.toString().concat( tail );
        }
    }

    protected void genRecognizerFile(
    		Tool tool,
    		CodeGenerator generator,
    		Grammar grammar,
    		ST outputFileST
    ) throws IOException
    {
        /*
            Below is an experimental attempt at providing a few named action blocks
            that are printed in both lexer and parser files from combined grammars.
            ANTLR appears to first generate a parser, then generate an independent lexer,
            and then generate code from that. It keeps the combo/parser grammar object
            and the lexer grammar object, as well as their respective code generator and
            target instances, completely independent. So, while a bit hack-ish, this is
            a solution that should work without having to modify Terrence Parr's
            core tool code.

            - sharedActionBlocks is a class variable containing a hash map
            - if this method is called with a combo grammar, and the action map
              in the grammar contains an entry for the named scope "all",
              add an entry to sharedActionBlocks mapping the grammar name to
              the "all" action map.
            - if this method is called with an `implicit lexer'
              (one that's extracted from a combo grammar), check to see if
              there's an entry in sharedActionBlocks for the lexer's grammar name.
            - if there is an action map entry, place it in the lexer's action map
            - the recognizerFile template has code to place the
              "all" actions appropriately

            problems:
              - This solution assumes that the parser will be generated
                before the lexer. If that changes at some point, this will
                not work.
              - I have not investigated how this works with delegation yet

            Kyle Yetter - March 25, 2010
        */

        if ( grammar.type == Grammar.COMBINED ) {
            Map<String, Map<String, Object>> actions = grammar.getActions();
            if ( actions.containsKey( "all" ) ) {
                sharedActionBlocks.put( grammar.name, actions.get( "all" ) );
            }
        } else if ( grammar.implicitLexer ) {
            if ( sharedActionBlocks.containsKey( grammar.name ) ) {
                Map<String, Map<String, Object>> actions = grammar.getActions();
                actions.put( "all", sharedActionBlocks.get( grammar.name ) );
            }
        }

        STGroup group = generator.getTemplates();
        RubyRenderer renderer = new RubyRenderer();
        try {
            group.registerRenderer( Class.forName( "java.lang.String" ), renderer );
        } catch ( ClassNotFoundException e ) {
            // this shouldn't happen
            System.err.println( "ClassNotFoundException: " + e.getMessage() );
            e.printStackTrace( System.err );
        }
        String fileName =
            generator.getRecognizerFileName( grammar.name, grammar.type );
        generator.write( outputFileST, fileName );
    }

    public String getTargetCharLiteralFromANTLRCharLiteral(
        CodeGenerator generator,
        String literal
    )
    {
        int code_point = 0;
        literal = literal.substring( 1, literal.length() - 1 );

        if ( literal.charAt( 0 ) == '\\' ) {
            switch ( literal.charAt( 1 ) ) {
                case    '\\':
                case    '"':
                case    '\'':
                    code_point = literal.codePointAt( 1 );
                    break;
                case    'n':
                    code_point = 10;
                    break;
                case    'r':
                    code_point = 13;
                    break;
                case    't':
                    code_point = 9;
                    break;
                case    'b':
                    code_point = 8;
                    break;
                case    'f':
                    code_point = 12;
                    break;
                case    'u':    // Assume unnnn
                    code_point = Integer.parseInt( literal.substring( 2 ), 16 );
                    break;
                default:
                    System.out.println( "1: hey you didn't account for this: \"" + literal + "\"" );
                    break;
            }
        } else if ( literal.length() == 1 ) {
            code_point = literal.codePointAt( 0 );
        } else {
            System.out.println( "2: hey you didn't account for this: \"" + literal + "\"" );
        }

        return ( "0x" + Integer.toHexString( code_point ) );
    }

    public int getMaxCharValue( CodeGenerator generator )
    {
        // Versions before 1.9 do not support unicode
        return 0xFF;
    }

    public String getTokenTypeAsTargetLabel( CodeGenerator generator, int ttype )
    {
        String name = generator.grammar.getTokenDisplayName( ttype );
        // If name is a literal, return the token type instead
        if ( name.charAt( 0 )=='\'' ) {
            return generator.grammar.computeTokenNameFromLiteral( ttype, name );
        }
        return name;
    }

    public boolean isValidActionScope( int grammarType, String scope ) {
        if ( scope.equals( "all" ) )       {
            return true;
        }
        if ( scope.equals( "token" ) )     {
            return true;
        }
        if ( scope.equals( "module" ) )    {
            return true;
        }
        if ( scope.equals( "overrides" ) ) {
            return true;
        }

        switch ( grammarType ) {
        case Grammar.LEXER:
            if ( scope.equals( "lexer" ) ) {
                return true;
            }
            break;
        case Grammar.PARSER:
            if ( scope.equals( "parser" ) ) {
                return true;
            }
            break;
        case Grammar.COMBINED:
            if ( scope.equals( "parser" ) ) {
                return true;
            }
            if ( scope.equals( "lexer" ) ) {
                return true;
            }
            break;
        case Grammar.TREE_PARSER:
            if ( scope.equals( "treeparser" ) ) {
                return true;
            }
            break;
        }
        return false;
    }

    public String encodeIntAsCharEscape( final int v ) {
        final int intValue;

        if ( v == 65535 ) {
            intValue = -1;
        } else {
            intValue = v;
        }

        return String.valueOf( intValue );
    }
}