1/*
2 * ProGuard -- shrinking, optimization, obfuscation, and preverification
3 *             of Java bytecode.
4 *
5 * Copyright (c) 2002-2009 Eric Lafortune (eric@graphics.cornell.edu)
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the Free
9 * Software Foundation; either version 2 of the License, or (at your option)
10 * any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21package proguard.classfile.constant;
22
23import proguard.classfile.*;
24import proguard.classfile.constant.visitor.ConstantVisitor;
25
26import java.io.UnsupportedEncodingException;
27
28/**
29 * This Constant represents a UTF-8 constant in the constant pool.
30 *
31 * @author Eric Lafortune
32 */
33public class Utf8Constant extends Constant
34{
35    private static final char TWO_BYTE_LIMIT     = 0x80;
36    private static final int  TWO_BYTE_CONSTANT1 = 0xc0;
37    private static final int  TWO_BYTE_CONSTANT2 = 0x80;
38    private static final int  TWO_BYTE_SHIFT1    = 6;
39    private static final int  TWO_BYTE_MASK1     = 0x1f;
40    private static final int  TWO_BYTE_MASK2     = 0x3f;
41
42    private static final char THREE_BYTE_LIMIT     = 0x800;
43    private static final int  THREE_BYTE_CONSTANT1 = 0xe0;
44    private static final int  THREE_BYTE_CONSTANT2 = 0x80;
45    private static final int  THREE_BYTE_CONSTANT3 = 0x80;
46    private static final int  THREE_BYTE_SHIFT1    = 12;
47    private static final int  THREE_BYTE_SHIFT2    = 6;
48    private static final int  THREE_BYTE_MASK1     = 0x0f;
49    private static final int  THREE_BYTE_MASK2     = 0x3f;
50    private static final int  THREE_BYTE_MASK3     = 0x3f;
51
52
53    // There are a lot of Utf8Constant objects, so we're optimising their storage.
54    // Initially, we're storing the UTF-8 bytes in a byte array.
55    // When the corresponding String is requested, we ditch the array and just
56    // store the String.
57
58    //private int u2length;
59    private byte[] bytes;
60
61    private String string;
62
63
64    /**
65     * Creates an uninitialized Utf8Constant.
66     *
67     */
68    public Utf8Constant()
69    {
70    }
71
72
73    /**
74     * Creates a Utf8Constant containing the given string.
75     */
76    public Utf8Constant(String string)
77    {
78        this.bytes  = null;
79        this.string = string;
80    }
81
82
83    /**
84     * Initializes the UTF-8 data with an array of bytes.
85     */
86    public void setBytes(byte[] bytes)
87    {
88        this.bytes  = bytes;
89        this.string = null;
90    }
91
92
93    /**
94     * Returns the UTF-8 data as an array of bytes.
95     */
96    public byte[] getBytes()
97    {
98        try
99        {
100            switchToByteArrayRepresentation();
101        }
102        catch (UnsupportedEncodingException ex)
103        {
104            throw new RuntimeException(ex.getMessage());
105        }
106
107        return bytes;
108    }
109
110
111    /**
112     * Initializes the UTF-8 data with a String.
113     */
114    public void setString(String utf8String)
115    {
116        this.bytes  = null;
117        this.string = utf8String;
118    }
119
120
121    /**
122     * Returns the UTF-8 data as a String.
123     */
124    public String getString()
125    {
126        try
127        {
128            switchToStringRepresentation();
129        }
130        catch (UnsupportedEncodingException ex)
131        {
132            throw new RuntimeException(ex.getMessage());
133        }
134
135        return string;
136    }
137
138
139    // Implementations for Constant.
140
141    public int getTag()
142    {
143        return ClassConstants.CONSTANT_Utf8;
144    }
145
146    public void accept(Clazz clazz, ConstantVisitor constantVisitor)
147    {
148        constantVisitor.visitUtf8Constant(clazz, this);
149    }
150
151
152    // Small utility methods.
153
154    /**
155     * Switches to a byte array representation of the UTF-8 data.
156     */
157    private void switchToByteArrayRepresentation() throws UnsupportedEncodingException
158    {
159        if (bytes == null)
160        {
161            bytes  = getByteArrayRepresentation(string);
162            string = null;
163        }
164    }
165
166
167    /**
168     * Switches to a String representation of the UTF-8 data.
169     */
170    private void switchToStringRepresentation() throws UnsupportedEncodingException
171    {
172        if (string == null)
173        {
174            string = getStringRepresentation(bytes);
175            bytes  = null;
176        }
177    }
178
179
180    /**
181     * Returns the modified UTF-8 byte array representation of the given string.
182     */
183    private byte[] getByteArrayRepresentation(String string) throws UnsupportedEncodingException
184    {
185        // We're computing the byte array ourselves, because the implementation
186        // of String.getBytes("UTF-8") has a bug, at least up to JRE 1.4.2.
187        // Also note the special treatment of the 0 character.
188
189        // Compute the byte array length.
190        int byteLength   = 0;
191        int stringLength = string.length();
192        for (int stringIndex = 0; stringIndex < stringLength; stringIndex++)
193        {
194            char c = string.charAt(stringIndex);
195
196            // The character is represented by one, two, or three bytes.
197            byteLength += c == 0                ? 2 :
198                          c <  TWO_BYTE_LIMIT   ? 1 :
199                          c <  THREE_BYTE_LIMIT ? 2 :
200                                                  3;
201        }
202
203        // Allocate the byte array with the computed length.
204        byte[] bytes  = new byte[byteLength];
205
206        // Fill out the array.
207        int byteIndex = 0;
208        for (int stringIndex = 0; stringIndex < stringLength; stringIndex++)
209        {
210            char c = string.charAt(stringIndex);
211            if (c == 0)
212            {
213                // The 0 character gets a two-byte representation in classes.
214                bytes[byteIndex++] = (byte)TWO_BYTE_CONSTANT1;
215                bytes[byteIndex++] = (byte)TWO_BYTE_CONSTANT2;
216            }
217            else if (c < TWO_BYTE_LIMIT)
218            {
219                // The character is represented by a single byte.
220                bytes[byteIndex++] = (byte)c;
221            }
222            else if (c < THREE_BYTE_LIMIT)
223            {
224                // The character is represented by two bytes.
225                bytes[byteIndex++] = (byte)(TWO_BYTE_CONSTANT1 | ((c >>> TWO_BYTE_SHIFT1) & TWO_BYTE_MASK1));
226                bytes[byteIndex++] = (byte)(TWO_BYTE_CONSTANT2 | ( c                      & TWO_BYTE_MASK2));
227            }
228            else
229            {
230                // The character is represented by three bytes.
231                bytes[byteIndex++] = (byte)(THREE_BYTE_CONSTANT1 | ((c >>> THREE_BYTE_SHIFT1) & THREE_BYTE_MASK1));
232                bytes[byteIndex++] = (byte)(THREE_BYTE_CONSTANT2 | ((c >>> THREE_BYTE_SHIFT2) & THREE_BYTE_MASK2));
233                bytes[byteIndex++] = (byte)(THREE_BYTE_CONSTANT3 | ( c                        & THREE_BYTE_MASK3));
234            }
235        }
236
237        return bytes;
238    }
239
240
241    /**
242     * Returns the String representation of the given modified UTF-8 byte array.
243     */
244    private String getStringRepresentation(byte[] bytes) throws UnsupportedEncodingException
245    {
246        // We're computing the string ourselves, because the implementation
247        // of "new String(bytes)" doesn't honor the special treatment of
248        // the 0 character in JRE 1.6_u11.
249
250        // Allocate the byte array with the computed length.
251        char[] chars  = new char[bytes.length];
252
253        // Fill out the array.
254        int charIndex = 0;
255        int byteIndex = 0;
256        while (byteIndex < bytes.length)
257        {
258
259            int b = bytes[byteIndex++] & 0xff;
260
261            // Depending on the flag bits in the first byte, the character
262            // is represented by a single byte, by two bytes, or by three
263            // bytes. We're not checking the redundant flag bits in the
264            // second byte and the third byte.
265            try
266            {
267                chars[charIndex++] =
268                    (char)(b < TWO_BYTE_CONSTANT1   ? b                                                          :
269
270                           b < THREE_BYTE_CONSTANT1 ? ((b                  & TWO_BYTE_MASK1) << TWO_BYTE_SHIFT1) |
271                                                      ((bytes[byteIndex++] & TWO_BYTE_MASK2)                   ) :
272
273                                                      ((b                  & THREE_BYTE_MASK1) << THREE_BYTE_SHIFT1) |
274                                                      ((bytes[byteIndex++] & THREE_BYTE_MASK2) << THREE_BYTE_SHIFT2) |
275                                                      ((bytes[byteIndex++] & THREE_BYTE_MASK3)                     ));
276            }
277            catch (ArrayIndexOutOfBoundsException e)
278            {
279                throw new UnsupportedEncodingException("Missing UTF-8 bytes after initial byte [0x"+Integer.toHexString(b)+"] in string ["+new String(chars, 0, charIndex)+"]");
280            }
281        }
282
283        return new String(chars, 0, charIndex);
284    }
285}
286