classfile/constant/Utf8Constant.java

/*
 * ProGuard -- shrinking, optimization, obfuscation, and preverification
 *             of Java bytecode.
 *
 * Copyright (c) 2002-2009 Eric Lafortune (eric@graphics.cornell.edu)
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package proguard.classfile.constant;

import proguard.classfile.*;
import proguard.classfile.constant.visitor.ConstantVisitor;

import java.io.UnsupportedEncodingException;

/**
 * This Constant represents a UTF-8 constant in the constant pool.
 *
 * @author Eric Lafortune
 */
public class Utf8Constant extends Constant
{
    private static final char TWO_BYTE_LIMIT     = 0x80;
    private static final int  TWO_BYTE_CONSTANT1 = 0xc0;
    private static final int  TWO_BYTE_CONSTANT2 = 0x80;
    private static final int  TWO_BYTE_SHIFT1    = 6;
    private static final int  TWO_BYTE_MASK1     = 0x1f;
    private static final int  TWO_BYTE_MASK2     = 0x3f;

    private static final char THREE_BYTE_LIMIT     = 0x800;
    private static final int  THREE_BYTE_CONSTANT1 = 0xe0;
    private static final int  THREE_BYTE_CONSTANT2 = 0x80;
    private static final int  THREE_BYTE_CONSTANT3 = 0x80;
    private static final int  THREE_BYTE_SHIFT1    = 12;
    private static final int  THREE_BYTE_SHIFT2    = 6;
    private static final int  THREE_BYTE_MASK1     = 0x0f;
    private static final int  THREE_BYTE_MASK2     = 0x3f;
    private static final int  THREE_BYTE_MASK3     = 0x3f;


    // There are a lot of Utf8Constant objects, so we're optimising their storage.
    // Initially, we're storing the UTF-8 bytes in a byte array.
    // When the corresponding String is requested, we ditch the array and just
    // store the String.

    //private int u2length;
    private byte[] bytes;

    private String string;


    /**
     * Creates an uninitialized Utf8Constant.
     *
     */
    public Utf8Constant()
    {
    }


    /**
     * Creates a Utf8Constant containing the given string.
     */
    public Utf8Constant(String string)
    {
        this.bytes  = null;
        this.string = string;
    }


    /**
     * Initializes the UTF-8 data with an array of bytes.
     */
    public void setBytes(byte[] bytes)
    {
        this.bytes  = bytes;
        this.string = null;
    }


    /**
     * Returns the UTF-8 data as an array of bytes.
     */
    public byte[] getBytes()
    {
        try
        {
            switchToByteArrayRepresentation();
        }
        catch (UnsupportedEncodingException ex)
        {
            throw new RuntimeException(ex.getMessage());
        }

        return bytes;
    }


    /**
     * Initializes the UTF-8 data with a String.
     */
    public void setString(String utf8String)
    {
        this.bytes  = null;
        this.string = utf8String;
    }


    /**
     * Returns the UTF-8 data as a String.
     */
    public String getString()
    {
        try
        {
            switchToStringRepresentation();
        }
        catch (UnsupportedEncodingException ex)
        {
            throw new RuntimeException(ex.getMessage());
        }

        return string;
    }


    // Implementations for Constant.

    public int getTag()
    {
        return ClassConstants.CONSTANT_Utf8;
    }

    public void accept(Clazz clazz, ConstantVisitor constantVisitor)
    {
        constantVisitor.visitUtf8Constant(clazz, this);
    }


    // Small utility methods.

    /**
     * Switches to a byte array representation of the UTF-8 data.
     */
    private void switchToByteArrayRepresentation() throws UnsupportedEncodingException
    {
        if (bytes == null)
        {
            bytes  = getByteArrayRepresentation(string);
            string = null;
        }
    }


    /**
     * Switches to a String representation of the UTF-8 data.
     */
    private void switchToStringRepresentation() throws UnsupportedEncodingException
    {
        if (string == null)
        {
            string = getStringRepresentation(bytes);
            bytes  = null;
        }
    }


    /**
     * Returns the modified UTF-8 byte array representation of the given string.
     */
    private byte[] getByteArrayRepresentation(String string) throws UnsupportedEncodingException
    {
        // We're computing the byte array ourselves, because the implementation
        // of String.getBytes("UTF-8") has a bug, at least up to JRE 1.4.2.
        // Also note the special treatment of the 0 character.

        // Compute the byte array length.
        int byteLength   = 0;
        int stringLength = string.length();
        for (int stringIndex = 0; stringIndex < stringLength; stringIndex++)
        {
            char c = string.charAt(stringIndex);

            // The character is represented by one, two, or three bytes.
            byteLength += c == 0                ? 2 :
                          c <  TWO_BYTE_LIMIT   ? 1 :
                          c <  THREE_BYTE_LIMIT ? 2 :
                                                  3;
        }

        // Allocate the byte array with the computed length.
        byte[] bytes  = new byte[byteLength];

        // Fill out the array.
        int byteIndex = 0;
        for (int stringIndex = 0; stringIndex < stringLength; stringIndex++)
        {
            char c = string.charAt(stringIndex);
            if (c == 0)
            {
                // The 0 character gets a two-byte representation in classes.
                bytes[byteIndex++] = (byte)TWO_BYTE_CONSTANT1;
                bytes[byteIndex++] = (byte)TWO_BYTE_CONSTANT2;
            }
            else if (c < TWO_BYTE_LIMIT)
            {
                // The character is represented by a single byte.
                bytes[byteIndex++] = (byte)c;
            }
            else if (c < THREE_BYTE_LIMIT)
            {
                // The character is represented by two bytes.
                bytes[byteIndex++] = (byte)(TWO_BYTE_CONSTANT1 | ((c >>> TWO_BYTE_SHIFT1) & TWO_BYTE_MASK1));
                bytes[byteIndex++] = (byte)(TWO_BYTE_CONSTANT2 | ( c                      & TWO_BYTE_MASK2));
            }
            else
            {
                // The character is represented by three bytes.
                bytes[byteIndex++] = (byte)(THREE_BYTE_CONSTANT1 | ((c >>> THREE_BYTE_SHIFT1) & THREE_BYTE_MASK1));
                bytes[byteIndex++] = (byte)(THREE_BYTE_CONSTANT2 | ((c >>> THREE_BYTE_SHIFT2) & THREE_BYTE_MASK2));
                bytes[byteIndex++] = (byte)(THREE_BYTE_CONSTANT3 | ( c                        & THREE_BYTE_MASK3));
            }
        }

        return bytes;
    }


    /**
     * Returns the String representation of the given modified UTF-8 byte array.
     */
    private String getStringRepresentation(byte[] bytes) throws UnsupportedEncodingException
    {
        // We're computing the string ourselves, because the implementation
        // of "new String(bytes)" doesn't honor the special treatment of
        // the 0 character in JRE 1.6_u11.

        // Allocate the byte array with the computed length.
        char[] chars  = new char[bytes.length];

        // Fill out the array.
        int charIndex = 0;
        int byteIndex = 0;
        while (byteIndex < bytes.length)
        {

            int b = bytes[byteIndex++] & 0xff;

            // Depending on the flag bits in the first byte, the character
            // is represented by a single byte, by two bytes, or by three
            // bytes. We're not checking the redundant flag bits in the
            // second byte and the third byte.
            try
            {
                chars[charIndex++] =
                    (char)(b < TWO_BYTE_CONSTANT1   ? b                                                          :

                           b < THREE_BYTE_CONSTANT1 ? ((b                  & TWO_BYTE_MASK1) << TWO_BYTE_SHIFT1) |
                                                      ((bytes[byteIndex++] & TWO_BYTE_MASK2)                   ) :

                                                      ((b                  & THREE_BYTE_MASK1) << THREE_BYTE_SHIFT1) |
                                                      ((bytes[byteIndex++] & THREE_BYTE_MASK2) << THREE_BYTE_SHIFT2) |
                                                      ((bytes[byteIndex++] & THREE_BYTE_MASK3)                     ));
            }
            catch (ArrayIndexOutOfBoundsException e)
            {
                throw new UnsupportedEncodingException("Missing UTF-8 bytes after initial byte [0x"+Integer.toHexString(b)+"] in string ["+new String(chars, 0, charIndex)+"]");
            }
        }

        return new String(chars, 0, charIndex);
    }
}