1/* 2 * Copyright (C) 2010 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package libcore.java.nio.charset; 18 19import java.nio.ByteBuffer; 20import java.nio.CharBuffer; 21import java.nio.charset.Charset; 22import java.nio.charset.CharsetEncoder; 23import java.util.Arrays; 24 25public class CharsetTest extends junit.framework.TestCase { 26 public void test_guaranteedCharsetsAvailable() throws Exception { 27 // All Java implementations must support these charsets. 28 assertNotNull(Charset.forName("ISO-8859-1")); 29 assertNotNull(Charset.forName("US-ASCII")); 30 assertNotNull(Charset.forName("UTF-16")); 31 assertNotNull(Charset.forName("UTF-16BE")); 32 assertNotNull(Charset.forName("UTF-16LE")); 33 assertNotNull(Charset.forName("UTF-8")); 34 } 35 36 public void test_allAvailableCharsets() throws Exception { 37 // Check that we can instantiate every Charset, CharsetDecoder, and CharsetEncoder. 38 for (String charsetName : Charset.availableCharsets().keySet()) { 39 if (charsetName.equals("UTF-32")) { 40 // Our UTF-32 is broken. http://b/2702411 41 // TODO: remove this hack when UTF-32 is fixed. 42 continue; 43 } 44 45 Charset cs = Charset.forName(charsetName); 46 assertNotNull(cs.newDecoder()); 47 if (cs.canEncode()) { 48 CharsetEncoder enc = cs.newEncoder(); 49 assertNotNull(enc); 50 assertNotNull(enc.replacement()); 51 } 52 } 53 } 54 55 public void test_EUC_JP() throws Exception { 56 assertEncodes(Charset.forName("EUC-JP"), "\ufffd", 0xf4, 0xfe); 57 } 58 59 public void test_SCSU() throws Exception { 60 assertEncodes(Charset.forName("SCSU"), "\ufffd", 14, 0xff, 0xfd); 61 } 62 63 public void test_Shift_JIS() throws Exception { 64 assertEncodes(Charset.forName("Shift_JIS"), "\ufffd", 0xfc, 0xfc); 65 } 66 67 public void test_UTF_16() throws Exception { 68 Charset cs = Charset.forName("UTF-16"); 69 // Writes big-endian, with a big-endian BOM. 70 assertEncodes(cs, "a\u0666", 0xfe, 0xff, 0, 'a', 0x06, 0x66); 71 // Reads whatever the BOM tells it to read... 72 assertDecodes(cs, "a\u0666", 0xfe, 0xff, 0, 'a', 0x06, 0x66); 73 assertDecodes(cs, "a\u0666", 0xff, 0xfe, 'a', 0, 0x66, 0x06); 74 // ...and defaults to reading big-endian if there's no BOM. 75 assertDecodes(cs, "a\u0666", 0, 'a', 0x06, 0x66); 76 } 77 78 public void test_UTF_16BE() throws Exception { 79 Charset cs = Charset.forName("UTF-16BE"); 80 // Writes big-endian, with no BOM. 81 assertEncodes(cs, "a\u0666", 0, 'a', 0x06, 0x66); 82 // Treats a little-endian BOM as an error and continues to read big-endian. 83 // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result. 84 assertDecodes(cs, "\ufffda\u0666", 0xff, 0xfe, 0, 'a', 0x06, 0x66); 85 // Accepts a big-endian BOM and includes U+FEFF in the decoded output. 86 assertDecodes(cs, "\ufeffa\u0666", 0xfe, 0xff, 0, 'a', 0x06, 0x66); 87 // Defaults to reading big-endian. 88 assertDecodes(cs, "a\u0666", 0, 'a', 0x06, 0x66); 89 } 90 91 public void test_UTF_16LE() throws Exception { 92 Charset cs = Charset.forName("UTF-16LE"); 93 // Writes little-endian, with no BOM. 94 assertEncodes(cs, "a\u0666", 'a', 0, 0x66, 0x06); 95 // Accepts a little-endian BOM and includes U+FEFF in the decoded output. 96 assertDecodes(cs, "\ufeffa\u0666", 0xff, 0xfe, 'a', 0, 0x66, 0x06); 97 // Treats a big-endian BOM as an error and continues to read little-endian. 98 // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result. 99 assertDecodes(cs, "\ufffda\u0666", 0xfe, 0xff, 'a', 0, 0x66, 0x06); 100 // Defaults to reading little-endian. 101 assertDecodes(cs, "a\u0666", 'a', 0, 0x66, 0x06); 102 } 103 104 public void test_x_UTF_16LE_BOM() throws Exception { 105 Charset cs = Charset.forName("x-UTF-16LE-BOM"); 106 // Writes little-endian, with a BOM. 107 assertEncodes(cs, "a\u0666", 0xff, 0xfe, 'a', 0, 0x66, 0x06); 108 // Accepts a little-endian BOM and swallows the BOM. 109 assertDecodes(cs, "a\u0666", 0xff, 0xfe, 'a', 0, 0x66, 0x06); 110 // Swallows a big-endian BOM, but continues to read little-endian! 111 assertDecodes(cs, "\u6100\u6606", 0xfe, 0xff, 'a', 0, 0x66, 0x06); 112 // Defaults to reading little-endian. 113 assertDecodes(cs, "a\u0666", 'a', 0, 0x66, 0x06); 114 } 115 116 public void test_UTF_32() throws Exception { 117 Charset cs = Charset.forName("UTF-32"); 118 // Writes big-endian, with no BOM. 119 assertEncodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 120 // Reads whatever the BOM tells it to read... 121 assertDecodes(cs, "a\u0666", 0, 0, 0xfe, 0xff, 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 122 assertDecodes(cs, "a\u0666", 0xff, 0xfe, 0, 0, 'a', 0, 0, 0, 0x66, 0x06, 0, 0); 123 // ...and defaults to reading big-endian if there's no BOM. 124 assertDecodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 125 } 126 127 public void test_UTF_32BE() throws Exception { 128 Charset cs = Charset.forName("UTF-32BE"); 129 // Writes big-endian, with no BOM. 130 assertEncodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 131 // Treats a little-endian BOM as an error and continues to read big-endian. 132 // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result. 133 assertDecodes(cs, "\ufffda\u0666", 0xff, 0xfe, 0, 0, 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 134 // Accepts a big-endian BOM and swallows the BOM. 135 assertDecodes(cs, "a\u0666", 0, 0, 0xfe, 0xff, 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 136 // Defaults to reading big-endian. 137 assertDecodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 138 } 139 140 public void test_UTF_32LE() throws Exception { 141 Charset cs = Charset.forName("UTF-32LE"); 142 // Writes little-endian, with no BOM. 143 assertEncodes(cs, "a\u0666", 'a', 0, 0, 0, 0x66, 0x06, 0, 0); 144 // Accepts a little-endian BOM and swallows the BOM. 145 assertDecodes(cs, "a\u0666", 0xff, 0xfe, 0, 0, 'a', 0, 0, 0, 0x66, 0x06, 0, 0); 146 // Treats a big-endian BOM as an error and continues to read little-endian. 147 // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result. 148 assertDecodes(cs, "\ufffda\u0666", 0, 0, 0xfe, 0xff, 'a', 0, 0, 0, 0x66, 0x06, 0, 0); 149 // Defaults to reading little-endian. 150 assertDecodes(cs, "a\u0666", 'a', 0, 0, 0, 0x66, 0x06, 0, 0); 151 } 152 153 public void test_X_UTF_32BE_BOM() throws Exception { 154 Charset cs = Charset.forName("X-UTF-32BE-BOM"); 155 // Writes big-endian, with a big-endian BOM. 156 assertEncodes(cs, "a\u0666", 0, 0, 0xfe, 0xff, 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 157 // Treats a little-endian BOM as an error and continues to read big-endian. 158 // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result. 159 assertDecodes(cs, "\ufffda\u0666", 0xff, 0xfe, 0, 0, 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 160 // Swallows a big-endian BOM, and continues to read big-endian. 161 assertDecodes(cs, "a\u0666", 0, 0, 0xfe, 0xff, 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 162 // Defaults to reading big-endian. 163 assertDecodes(cs, "a\u0666", 0, 0, 0, 'a', 0, 0, 0x06, 0x66); 164 } 165 166 public void test_X_UTF_32LE_BOM() throws Exception { 167 Charset cs = Charset.forName("X-UTF-32LE-BOM"); 168 // Writes little-endian, with a little-endian BOM. 169 assertEncodes(cs, "a\u0666", 0xff, 0xfe, 0, 0, 'a', 0, 0, 0, 0x66, 0x06, 0, 0); 170 // Accepts a little-endian BOM and swallows the BOM. 171 assertDecodes(cs, "a\u0666", 0xff, 0xfe, 0, 0, 'a', 0, 0, 0, 0x66, 0x06, 0, 0); 172 // Treats a big-endian BOM as an error and continues to read little-endian. 173 // This test uses REPLACE mode, so we get the U+FFFD replacement character in the result. 174 assertDecodes(cs, "\ufffda\u0666", 0, 0, 0xfe, 0xff, 'a', 0, 0, 0, 0x66, 0x06, 0, 0); 175 // Defaults to reading little-endian. 176 assertDecodes(cs, "a\u0666", 'a', 0, 0, 0, 0x66, 0x06, 0, 0); 177 } 178 179 public void test_preNioAliases() throws Exception { 180 // Various pre-nio java.lang/java.io encoding names are translated to nio charsets. 181 assertEquals("UTF-16BE", Charset.forName("UnicodeBigUnmarked").name()); 182 assertEquals("UTF-16LE", Charset.forName("UnicodeLittleUnmarked").name()); 183 assertEquals("UTF-16", Charset.forName("Unicode").name()); 184 assertEquals("UTF-16", Charset.forName("UnicodeBig").name()); 185 assertEquals("x-UTF-16LE-BOM", Charset.forName("UnicodeLittle").name()); 186 assertEquals("X-UTF-32BE-BOM", Charset.forName("UTF_32BE_BOM").name()); 187 assertEquals("X-UTF-32LE-BOM", Charset.forName("UTF_32LE_BOM").name()); 188 } 189 190 private byte[] toByteArray(int[] ints) { 191 byte[] result = new byte[ints.length]; 192 for (int i = 0; i < ints.length; ++i) { 193 result[i] = (byte) ints[i]; 194 } 195 return result; 196 } 197 198 private void assertEncodes(Charset cs, String s, int... expectedByteInts) throws Exception { 199 ByteBuffer out = cs.encode(s); 200 byte[] bytes = new byte[out.remaining()]; 201 out.get(bytes); 202 assertEquals(Arrays.toString(toByteArray(expectedByteInts)), Arrays.toString(bytes)); 203 } 204 205 private void assertDecodes(Charset cs, String s, int... byteInts) throws Exception { 206 ByteBuffer in = ByteBuffer.wrap(toByteArray(byteInts)); 207 CharBuffer out = cs.decode(in); 208 assertEquals(s, out.toString()); 209 } 210} 211