1/*
2 * Copyright (C) 2010 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package libcore.java.nio.charset;
18
19import java.nio.ByteBuffer;
20import java.nio.CharBuffer;
21import java.nio.charset.Charset;
22import java.nio.charset.CharsetEncoder;
23import java.util.Arrays;
24
25public class CharsetTest extends junit.framework.TestCase {
26    public void test_guaranteedCharsetsAvailable() throws Exception {
27        // All Java implementations must support these charsets.
28        assertNotNull(Charset.forName("ISO-8859-1"));
29        assertNotNull(Charset.forName("US-ASCII"));
30        assertNotNull(Charset.forName("UTF-16"));
31        assertNotNull(Charset.forName("UTF-16BE"));
32        assertNotNull(Charset.forName("UTF-16LE"));
33        assertNotNull(Charset.forName("UTF-8"));
34    }
35
36    public void test_allAvailableCharsets() throws Exception {
37        // Check that we can instantiate every Charset, CharsetDecoder, and CharsetEncoder.
38        for (String charsetName : Charset.availableCharsets().keySet()) {
39            if (charsetName.equals("UTF-32")) {
40                // Our UTF-32 is broken. http://b/2702411
41                // TODO: remove this hack when UTF-32 is fixed.
42                continue;
43            }
44
45            Charset cs = Charset.forName(charsetName);
46            assertNotNull(cs.newDecoder());
47            if (cs.canEncode()) {
48                CharsetEncoder enc = cs.newEncoder();
49                assertNotNull(enc);
50                assertNotNull(enc.replacement());
51            }
52        }
53    }
54
55    public void test_EUC_JP() throws Exception {
56        assertEncodes(Charset.forName("EUC-JP"), "\ufffd", 0xf4, 0xfe);
57    }
58
59    public void test_SCSU() throws Exception {
60        assertEncodes(Charset.forName("SCSU"), "\ufffd", 14, 0xff, 0xfd);
61    }
62
63    public void test_Shift_JIS() throws Exception {
64        assertEncodes(Charset.forName("Shift_JIS"), "\ufffd", 0xfc, 0xfc);
65    }
66
67    public void test_UTF_16() throws Exception {
68        Charset cs = Charset.forName("UTF-16");
69        // Writes big-endian, with a big-endian BOM.
70        assertEncodes(cs, "a\u0666", 0xfe, 0xff, 0, 'a', 0x06, 0x66);
71        // Reads whatever the BOM tells it to read...
72        assertDecodes(cs, "a\u0666", 0xfe, 0xff, 0, 'a', 0x06, 0x66);
73        assertDecodes(cs, "a\u0666", 0xff, 0xfe, 'a', 0, 0x66, 0x06);
74        // ...and defaults to reading big-endian if there's no BOM.
75        assertDecodes(cs, "a\u0666", 0, 'a', 0x06, 0x66);
76    }
77
78    public void test_UTF_16BE() throws Exception {
79        Charset cs = Charset.forName("UTF-16BE");
80        // Writes big-endian, with no BOM.
81        assertEncodes(cs, "a\u0666", 0, 'a', 0x06, 0x66);
82        // Treats a little-endian BOM as an error and continues to read big-endian.
83        // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result.
84        assertDecodes(cs, "\ufffda\u0666", 0xff, 0xfe, 0, 'a', 0x06, 0x66);
85        // Accepts a big-endian BOM and includes U+FEFF in the decoded output.
86        assertDecodes(cs, "\ufeffa\u0666", 0xfe, 0xff, 0, 'a', 0x06, 0x66);
87        // Defaults to reading big-endian.
88        assertDecodes(cs, "a\u0666", 0, 'a', 0x06, 0x66);
89    }
90
91    public void test_UTF_16LE() throws Exception {
92        Charset cs = Charset.forName("UTF-16LE");
93        // Writes little-endian, with no BOM.
94        assertEncodes(cs, "a\u0666", 'a', 0, 0x66, 0x06);
95        // Accepts a little-endian BOM and includes U+FEFF in the decoded output.
96        assertDecodes(cs, "\ufeffa\u0666", 0xff, 0xfe, 'a', 0, 0x66, 0x06);
97        // Treats a big-endian BOM as an error and continues to read little-endian.
98        // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result.
99        assertDecodes(cs, "\ufffda\u0666", 0xfe, 0xff, 'a', 0, 0x66, 0x06);
100        // Defaults to reading little-endian.
101        assertDecodes(cs, "a\u0666", 'a', 0, 0x66, 0x06);
102    }
103
104    public void test_x_UTF_16LE_BOM() throws Exception {
105        Charset cs = Charset.forName("x-UTF-16LE-BOM");
106        // Writes little-endian, with a BOM.
107        assertEncodes(cs, "a\u0666", 0xff, 0xfe, 'a', 0, 0x66, 0x06);
108        // Accepts a little-endian BOM and swallows the BOM.
109        assertDecodes(cs, "a\u0666", 0xff, 0xfe, 'a', 0, 0x66, 0x06);
110        // Swallows a big-endian BOM, but continues to read little-endian!
111        assertDecodes(cs, "\u6100\u6606", 0xfe, 0xff, 'a', 0, 0x66, 0x06);
112        // Defaults to reading little-endian.
113        assertDecodes(cs, "a\u0666", 'a', 0, 0x66, 0x06);
114    }
115
116    public void test_UTF_32() throws Exception {
117        Charset cs = Charset.forName("UTF-32");
118        // Writes big-endian, with no BOM.
119        assertEncodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
120        // Reads whatever the BOM tells it to read...
121        assertDecodes(cs, "a\u0666", 0, 0, 0xfe, 0xff, 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
122        assertDecodes(cs, "a\u0666", 0xff, 0xfe, 0, 0, 'a', 0, 0, 0, 0x66, 0x06, 0, 0);
123        // ...and defaults to reading big-endian if there's no BOM.
124        assertDecodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
125    }
126
127    public void test_UTF_32BE() throws Exception {
128        Charset cs = Charset.forName("UTF-32BE");
129        // Writes big-endian, with no BOM.
130        assertEncodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
131        // Treats a little-endian BOM as an error and continues to read big-endian.
132        // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result.
133        assertDecodes(cs, "\ufffda\u0666", 0xff, 0xfe, 0, 0, 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
134        // Accepts a big-endian BOM and swallows the BOM.
135        assertDecodes(cs, "a\u0666", 0, 0, 0xfe, 0xff, 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
136        // Defaults to reading big-endian.
137        assertDecodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
138    }
139
140    public void test_UTF_32LE() throws Exception {
141        Charset cs = Charset.forName("UTF-32LE");
142        // Writes little-endian, with no BOM.
143        assertEncodes(cs, "a\u0666", 'a', 0, 0, 0, 0x66, 0x06, 0, 0);
144        // Accepts a little-endian BOM and swallows the BOM.
145        assertDecodes(cs, "a\u0666", 0xff, 0xfe, 0, 0, 'a', 0, 0, 0, 0x66, 0x06, 0, 0);
146        // Treats a big-endian BOM as an error and continues to read little-endian.
147        // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result.
148        assertDecodes(cs, "\ufffda\u0666", 0, 0, 0xfe, 0xff, 'a', 0, 0, 0, 0x66, 0x06, 0, 0);
149        // Defaults to reading little-endian.
150        assertDecodes(cs, "a\u0666", 'a', 0, 0, 0, 0x66, 0x06, 0, 0);
151    }
152
153    public void test_X_UTF_32BE_BOM() throws Exception {
154        Charset cs = Charset.forName("X-UTF-32BE-BOM");
155        // Writes big-endian, with a big-endian BOM.
156        assertEncodes(cs, "a\u0666", 0, 0, 0xfe, 0xff, 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
157        // Treats a little-endian BOM as an error and continues to read big-endian.
158        // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result.
159        assertDecodes(cs, "\ufffda\u0666", 0xff, 0xfe, 0, 0, 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
160        // Swallows a big-endian BOM, and continues to read big-endian.
161        assertDecodes(cs, "a\u0666", 0, 0, 0xfe, 0xff, 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
162        // Defaults to reading big-endian.
163        assertDecodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66);
164    }
165
166    public void test_X_UTF_32LE_BOM() throws Exception {
167        Charset cs = Charset.forName("X-UTF-32LE-BOM");
168        // Writes little-endian, with a little-endian BOM.
169        assertEncodes(cs, "a\u0666", 0xff, 0xfe, 0, 0, 'a', 0, 0, 0, 0x66, 0x06, 0, 0);
170        // Accepts a little-endian BOM and swallows the BOM.
171        assertDecodes(cs, "a\u0666", 0xff, 0xfe, 0, 0, 'a', 0, 0, 0, 0x66, 0x06, 0, 0);
172        // Treats a big-endian BOM as an error and continues to read little-endian.
173        // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result.
174        assertDecodes(cs, "\ufffda\u0666", 0, 0, 0xfe, 0xff, 'a', 0, 0, 0, 0x66, 0x06, 0, 0);
175        // Defaults to reading little-endian.
176        assertDecodes(cs, "a\u0666", 'a', 0, 0, 0, 0x66, 0x06, 0, 0);
177    }
178
179    public void test_preNioAliases() throws Exception {
180        // Various pre-nio java.lang/java.io encoding names are translated to nio charsets.
181        assertEquals("UTF-16BE", Charset.forName("UnicodeBigUnmarked").name());
182        assertEquals("UTF-16LE", Charset.forName("UnicodeLittleUnmarked").name());
183        assertEquals("UTF-16", Charset.forName("Unicode").name());
184        assertEquals("UTF-16", Charset.forName("UnicodeBig").name());
185        assertEquals("x-UTF-16LE-BOM", Charset.forName("UnicodeLittle").name());
186        assertEquals("X-UTF-32BE-BOM", Charset.forName("UTF_32BE_BOM").name());
187        assertEquals("X-UTF-32LE-BOM", Charset.forName("UTF_32LE_BOM").name());
188    }
189
190    private byte[] toByteArray(int[] ints) {
191        byte[] result = new byte[ints.length];
192        for (int i = 0; i < ints.length; ++i) {
193            result[i] = (byte) ints[i];
194        }
195        return result;
196    }
197
198    private void assertEncodes(Charset cs, String s, int... expectedByteInts) throws Exception {
199        ByteBuffer out = cs.encode(s);
200        byte[] bytes = new byte[out.remaining()];
201        out.get(bytes);
202        assertEquals(Arrays.toString(toByteArray(expectedByteInts)), Arrays.toString(bytes));
203    }
204
205    private void assertDecodes(Charset cs, String s, int... byteInts) throws Exception {
206        ByteBuffer in = ByteBuffer.wrap(toByteArray(byteInts));
207        CharBuffer out = cs.decode(in);
208        assertEquals(s, out.toString());
209    }
210}
211