1/*
2 *  Licensed to the Apache Software Foundation (ASF) under one or more
3 *  contributor license agreements.  See the NOTICE file distributed with
4 *  this work for additional information regarding copyright ownership.
5 *  The ASF licenses this file to You under the Apache License, Version 2.0
6 *  (the "License"); you may not use this file except in compliance with
7 *  the License.  You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 *  Unless required by applicable law or agreed to in writing, software
12 *  distributed under the License is distributed on an "AS IS" BASIS,
13 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *  See the License for the specific language governing permissions and
15 *  limitations under the License.
16 */
17
18package java.nio.charset;
19
20import java.io.UnsupportedEncodingException;
21import java.nio.ByteBuffer;
22import java.nio.CharBuffer;
23import java.nio.charset.spi.CharsetProvider;
24import java.util.Collections;
25import java.util.HashMap;
26import java.util.HashSet;
27import java.util.Iterator;
28import java.util.Locale;
29import java.util.ServiceLoader;
30import java.util.Set;
31import java.util.SortedMap;
32import java.util.TreeMap;
33import libcore.icu.NativeConverter;
34
35/**
36 * A charset is a named mapping between Unicode characters and byte sequences. Every
37 * {@code Charset} can <i>decode</i>, converting a byte sequence into a sequence of characters,
38 * and some can also <i>encode</i>, converting a sequence of characters into a byte sequence.
39 * Use the method {@link #canEncode} to find out whether a charset supports both.
40 *
41 * <h4>Characters</h4>
42 * <p>In the context of this class, <i>character</i> always refers to a Java character: a Unicode
43 * code point in the range U+0000 to U+FFFF. (Java represents supplementary characters using surrogates.)
44 * Not all byte sequences will represent a character, and not
45 * all characters can necessarily be represented by a given charset. The method {@link #contains}
46 * can be used to determine whether every character representable by one charset can also be
47 * represented by another (meaning that a lossless transformation is possible from the contained
48 * to the container).
49 *
50 * <h4>Encodings</h4>
51 * <p>There are many possible ways to represent Unicode characters as byte sequences.
52 * See <a href="http://www.unicode.org/reports/tr17/">UTR#17: Unicode Character Encoding Model</a>
53 * for detailed discussion.
54 *
55 * <p>The most important mappings capable of representing every character are the Unicode
56 * Transformation Format (UTF) charsets. Of those, UTF-8 and the UTF-16 family are the most
57 * common. UTF-8 (described in <a href="http://www.ietf.org/rfc/rfc3629.txt">RFC 3629</a>)
58 * encodes a character using 1 to 4 bytes. UTF-16 uses exactly 2 bytes per character (potentially
59 * wasting space, but allowing efficient random access into BMP text), and UTF-32 uses
60 * exactly 4 bytes per character (trading off even more space for efficient random access into text
61 * that includes supplementary characters).
62 *
63 * <p>UTF-16 and UTF-32 encode characters directly, using their code point as a two- or four-byte
64 * integer. This means that any given UTF-16 or UTF-32 byte sequence is either big- or
65 * little-endian. To assist decoders, Unicode includes a special <i>byte order mark</i> (BOM)
66 * character U+FEFF used to determine the endianness of a sequence. The corresponding byte-swapped
67 * code point U+FFFE is guaranteed never to be assigned. If a UTF-16 decoder sees
68 * {@code 0xfe, 0xff}, for example, it knows it's reading a big-endian byte sequence, while
 * {@code 0xff, 0xfe} would indicate a little-endian byte sequence.
70 *
71 * <p>UTF-8 can contain a BOM, but since the UTF-8 encoding of a character always uses the same
72 * byte sequence, there is no information about endianness to convey. Seeing the bytes
73 * corresponding to the UTF-8 encoding of U+FEFF ({@code 0xef, 0xbb, 0xbf}) would only serve to
74 * suggest that you're reading UTF-8. Note that BOMs are decoded as the U+FEFF character, and
75 * will appear in the output character sequence. This means that a disadvantage to including a BOM
76 * in UTF-8 is that most applications that use UTF-8 do not expect to see a BOM. (This is also a
77 * reason to prefer UTF-8: it's one less complication to worry about.)
78 *
79 * <p>Because a BOM indicates how the data that follows should be interpreted, a BOM should occur
80 * as the first character in a character sequence.
81 *
82 * <p>See the <a href="http://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a> for
83 * more about dealing with BOMs.
84 *
85 * <h4>Endianness and BOM behavior</h4>
86 *
87 * <p>The following tables show the endianness and BOM behavior of the UTF-16 variants.
88 *
89 * <p>This table shows what the encoder writes. "BE" means that the byte sequence is big-endian,
90 * "LE" means little-endian. "BE BOM" means a big-endian BOM (that is, {@code 0xfe, 0xff}).
91 * <p><table width="100%">
92 * <tr> <th>Charset</th>  <th>Encoder writes</th>  </tr>
93 * <tr> <td>UTF-16BE</td> <td>BE, no BOM</td>      </tr>
94 * <tr> <td>UTF-16LE</td> <td>LE, no BOM</td>      </tr>
95 * <tr> <td>UTF-16</td>   <td>BE, with BE BOM</td> </tr>
96 * </table>
97 *
98 * <p>The next table shows how each variant's decoder behaves when reading a byte sequence.
99 * The exact meaning of "failure" in the table is dependent on the
100 * {@link CodingErrorAction} supplied to {@link CharsetDecoder#malformedInputAction}, so
101 * "BE, failure" means "the byte sequence is treated as big-endian, and a little-endian BOM
102 * triggers the malformedInputAction".
103 *
104 * <p>The phrase "includes BOM" means that the output includes the U+FEFF byte order mark character.
105 *
106 * <p><table width="100%">
107 * <tr> <th>Charset</th>  <th>BE BOM</th>           <th>LE BOM</th>           <th>No BOM</th> </tr>
108 * <tr> <td>UTF-16BE</td> <td>BE, includes BOM</td> <td>BE, failure</td>      <td>BE</td>     </tr>
109 * <tr> <td>UTF-16LE</td> <td>LE, failure</td>      <td>LE, includes BOM</td> <td>LE</td>     </tr>
110 * <tr> <td>UTF-16</td>   <td>BE</td>               <td>LE</td>               <td>BE</td>     </tr>
111 * </table>
112 *
113 * <h4>Charset names</h4>
114 * <p>A charset has a canonical name, returned by {@link #name}. Most charsets will
115 * also have one or more aliases, returned by {@link #aliases}. A charset can be looked up
116 * by canonical name or any of its aliases using {@link #forName}.
117 *
118 * <h4>Guaranteed-available charsets</h4>
119 * <p>The following charsets are available on every Java implementation:
120 * <ul>
121 * <li>ISO-8859-1
122 * <li>US-ASCII
123 * <li>UTF-16
124 * <li>UTF-16BE
125 * <li>UTF-16LE
126 * <li>UTF-8
127 * </ul>
128 * <p>All of these charsets support both decoding and encoding. The charsets whose names begin
129 * "UTF" can represent all characters, as mentioned above. The "ISO-8859-1" and "US-ASCII" charsets
130 * can only represent small subsets of these characters. Except when required to do otherwise for
131 * compatibility, new code should use one of the UTF charsets listed above. The platform's default
132 * charset is UTF-8. (This is in contrast to some older implementations, where the default charset
133 * depended on the user's locale.)
134 *
135 * <p>Most implementations will support hundreds of charsets. Use {@link #availableCharsets} or
136 * {@link #isSupported} to see what's available. If you intend to use the charset if it's
137 * available, just call {@link #forName} and catch the exceptions it throws if the charset isn't
138 * available.
139 *
140 * <p>Additional charsets can be made available by configuring one or more charset
141 * providers through provider configuration files. Such files are always named
142 * as "java.nio.charset.spi.CharsetProvider" and located in the
143 * "META-INF/services" directory of one or more classpaths. The files should be
144 * encoded in "UTF-8". Each line of their content specifies the class name of a
145 * charset provider which extends {@link java.nio.charset.spi.CharsetProvider}.
146 * A line should end with '\r', '\n' or '\r\n'. Leading and trailing whitespace
147 * is trimmed. Blank lines, and lines (after trimming) starting with "#" which are
148 * regarded as comments, are both ignored. Duplicates of names already found are also
149 * ignored. Both the configuration files and the provider classes will be loaded
150 * using the thread context class loader.
151 *
 * <p>Although this class is thread-safe, the {@link CharsetDecoder} and {@link CharsetEncoder}
 * instances it returns are inherently stateful.
154 */
155public abstract class Charset implements Comparable<Charset> {
156    private static final HashMap<String, Charset> CACHED_CHARSETS = new HashMap<String, Charset>();
157
158    private static final Charset DEFAULT_CHARSET = getDefaultCharset();
159
160    private final String canonicalName;
161
162    private final HashSet<String> aliasesSet;
163
164    /**
165     * Constructs a <code>Charset</code> object. Duplicated aliases are
166     * ignored.
167     *
168     * @param canonicalName
169     *            the canonical name of the charset.
170     * @param aliases
171     *            an array containing all aliases of the charset. May be null.
172     * @throws IllegalCharsetNameException
173     *             on an illegal value being supplied for either
174     *             <code>canonicalName</code> or for any element of
175     *             <code>aliases</code>.
176     */
177    protected Charset(String canonicalName, String[] aliases) {
178        // check whether the given canonical name is legal
179        checkCharsetName(canonicalName);
180        this.canonicalName = canonicalName;
181        // check each alias and put into a set
182        this.aliasesSet = new HashSet<String>();
183        if (aliases != null) {
184            for (String alias : aliases) {
185                checkCharsetName(alias);
186                this.aliasesSet.add(alias);
187            }
188        }
189    }
190
191    private static void checkCharsetName(String name) {
192        if (name.isEmpty()) {
193            throw new IllegalCharsetNameException(name);
194        }
195        int length = name.length();
196        for (int i = 0; i < length; ++i) {
197            if (!isValidCharsetNameCharacter(name.charAt(i))) {
198                throw new IllegalCharsetNameException(name);
199            }
200        }
201    }
202
203    private static boolean isValidCharsetNameCharacter(char c) {
204        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
205                c == '-' || c == '.' || c == ':' || c == '_';
206    }
207
208    /**
209     * Returns an immutable case-insensitive map from canonical names to {@code Charset} instances.
210     * If multiple charsets have the same canonical name, it is unspecified which is returned in
211     * the map. This method may be slow. If you know which charset you're looking for, use
212     * {@link #forName}.
213     * @return an immutable case-insensitive map from canonical names to {@code Charset} instances
214     */
215    public static SortedMap<String, Charset> availableCharsets() {
216        // Start with a copy of the built-in charsets...
217        TreeMap<String, Charset> charsets = new TreeMap<String, Charset>(String.CASE_INSENSITIVE_ORDER);
218        for (String charsetName : NativeConverter.getAvailableCharsetNames()) {
219            Charset charset = NativeConverter.charsetForName(charsetName);
220            charsets.put(charset.name(), charset);
221        }
222
223        // Add all charsets provided by all charset providers...
224        for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) {
225            Iterator<Charset> it = charsetProvider.charsets();
226            while (it.hasNext()) {
227                Charset cs = it.next();
228                // A CharsetProvider can't override a built-in Charset.
229                if (!charsets.containsKey(cs.name())) {
230                    charsets.put(cs.name(), cs);
231                }
232            }
233        }
234
235        return Collections.unmodifiableSortedMap(charsets);
236    }
237
238    private static Charset cacheCharset(String charsetName, Charset cs) {
239        synchronized (CACHED_CHARSETS) {
240            // Get the canonical name for this charset, and the canonical instance from the table.
241            String canonicalName = cs.name();
242            Charset canonicalCharset = CACHED_CHARSETS.get(canonicalName);
243            if (canonicalCharset == null) {
244                canonicalCharset = cs;
245            }
246
247            // Cache the charset by its canonical name...
248            CACHED_CHARSETS.put(canonicalName, canonicalCharset);
249
250            // And the name the user used... (Section 1.4 of http://unicode.org/reports/tr22/ means
251            // that many non-alias, non-canonical names are valid. For example, "utf8" isn't an
252            // alias of the canonical name "UTF-8", but we shouldn't penalize consistent users of
253            // such names unduly.)
254            CACHED_CHARSETS.put(charsetName, canonicalCharset);
255
256            // And all its aliases...
257            for (String alias : cs.aliasesSet) {
258                CACHED_CHARSETS.put(alias, canonicalCharset);
259            }
260
261            return canonicalCharset;
262        }
263    }
264
265    /**
266     * Returns a {@code Charset} instance for the named charset.
267     *
268     * @param charsetName a charset name (either canonical or an alias)
269     * @throws IllegalCharsetNameException
270     *             if the specified charset name is illegal.
271     * @throws UnsupportedCharsetException
272     *             if the desired charset is not supported by this runtime.
273     */
274    public static Charset forName(String charsetName) {
275        // Is this charset in our cache?
276        Charset cs;
277        synchronized (CACHED_CHARSETS) {
278            cs = CACHED_CHARSETS.get(charsetName);
279            if (cs != null) {
280                return cs;
281            }
282        }
283
284        if (charsetName == null) {
285            throw new IllegalCharsetNameException(null);
286        }
287
288        // Is this a built-in charset supported by ICU?
289        checkCharsetName(charsetName);
290        cs = NativeConverter.charsetForName(charsetName);
291        if (cs != null) {
292            return cacheCharset(charsetName, cs);
293        }
294
295        // Does a configured CharsetProvider have this charset?
296        for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) {
297            cs = charsetProvider.charsetForName(charsetName);
298            if (cs != null) {
299                return cacheCharset(charsetName, cs);
300            }
301        }
302
303        throw new UnsupportedCharsetException(charsetName);
304    }
305
306    /**
307     * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException},
308     * which is all pre-nio code claims to throw.
309     *
310     * @hide internal use only
311     */
312    public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException {
313        try {
314            return Charset.forName(charsetName);
315        } catch (Exception cause) {
316            UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName);
317            ex.initCause(cause);
318            throw ex;
319        }
320    }
321
322    /**
323     * Determines whether the specified charset is supported by this runtime.
324     *
325     * @param charsetName
326     *            the name of the charset.
327     * @return true if the specified charset is supported, otherwise false.
328     * @throws IllegalCharsetNameException
329     *             if the specified charset name is illegal.
330     */
331    public static boolean isSupported(String charsetName) {
332        try {
333            forName(charsetName);
334            return true;
335        } catch (UnsupportedCharsetException ex) {
336            return false;
337        }
338    }
339
340    /**
341     * Determines whether this charset is a superset of the given charset. A charset C1 contains
342     * charset C2 if every character representable by C2 is also representable by C1. This means
343     * that lossless conversion is possible from C2 to C1 (but not necessarily the other way
344     * round). It does <i>not</i> imply that the two charsets use the same byte sequences for the
345     * characters they share.
346     *
347     * <p>Note that this method is allowed to be conservative, and some implementations may return
348     * false when this charset does contain the other charset. Android's implementation is precise,
349     * and will always return true in such cases.
350     *
351     * @param charset
352     *            a given charset.
353     * @return true if this charset is a super set of the given charset,
354     *         false if it's unknown or this charset is not a superset of
355     *         the given charset.
356     */
357    public abstract boolean contains(Charset charset);
358
359    /**
360     * Gets a new instance of an encoder for this charset.
361     *
362     * @return a new instance of an encoder for this charset.
363     */
364    public abstract CharsetEncoder newEncoder();
365
366    /**
367     * Gets a new instance of a decoder for this charset.
368     *
369     * @return a new instance of a decoder for this charset.
370     */
371    public abstract CharsetDecoder newDecoder();
372
373    /**
374     * Gets the canonical name of this charset.
375     *
376     * @return this charset's name in canonical form.
377     */
378    public final String name() {
379        return this.canonicalName;
380    }
381
382    /**
383     * Gets the set of this charset's aliases.
384     *
385     * @return an unmodifiable set of this charset's aliases.
386     */
387    public final Set<String> aliases() {
388        return Collections.unmodifiableSet(this.aliasesSet);
389    }
390
391    /**
392     * Gets the name of this charset for the default locale.
393     *
394     * <p>The default implementation returns the canonical name of this charset.
395     * Subclasses may return a localized display name.
396     *
397     * @return the name of this charset for the default locale.
398     */
399    public String displayName() {
400        return this.canonicalName;
401    }
402
403    /**
404     * Gets the name of this charset for the specified locale.
405     *
406     * <p>The default implementation returns the canonical name of this charset.
407     * Subclasses may return a localized display name.
408     *
409     * @param l
410     *            a certain locale
411     * @return the name of this charset for the specified locale
412     */
413    public String displayName(Locale l) {
414        return this.canonicalName;
415    }
416
417    /**
418     * Indicates whether this charset is known to be registered in the IANA
419     * Charset Registry.
420     *
421     * @return true if the charset is known to be registered, otherwise returns
422     *         false.
423     */
424    public final boolean isRegistered() {
425        return !canonicalName.startsWith("x-") && !canonicalName.startsWith("X-");
426    }
427
428    /**
429     * Returns true if this charset supports encoding, false otherwise.
430     *
431     * @return true if this charset supports encoding, false otherwise.
432     */
433    public boolean canEncode() {
434        return true;
435    }
436
437    /**
438     * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from
439     * {@code buffer}.
440     * This method uses {@code CodingErrorAction.REPLACE}.
441     *
442     * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
443     * for performance.
444     *
445     * @param buffer
446     *            the character buffer containing the content to be encoded.
447     * @return the result of the encoding.
448     */
449    public final ByteBuffer encode(CharBuffer buffer) {
450        try {
451            return newEncoder()
452                    .onMalformedInput(CodingErrorAction.REPLACE)
453                    .onUnmappableCharacter(CodingErrorAction.REPLACE).encode(
454                            buffer);
455        } catch (CharacterCodingException ex) {
456            throw new Error(ex.getMessage(), ex);
457        }
458    }
459
460    /**
461     * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from {@code s}.
462     * This method uses {@code CodingErrorAction.REPLACE}.
463     *
464     * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
465     * for performance.
466     *
467     * @param s the string to be encoded.
468     * @return the result of the encoding.
469     */
470    public final ByteBuffer encode(String s) {
471        return encode(CharBuffer.wrap(s));
472    }
473
474    /**
475     * Returns a new {@code CharBuffer} containing the characters decoded from {@code buffer}.
476     * This method uses {@code CodingErrorAction.REPLACE}.
477     *
478     * <p>Applications should generally create a {@link CharsetDecoder} using {@link #newDecoder}
479     * for performance.
480     *
481     * @param buffer
482     *            the byte buffer containing the content to be decoded.
483     * @return a character buffer containing the output of the decoding.
484     */
485    public final CharBuffer decode(ByteBuffer buffer) {
486        try {
487            return newDecoder()
488                    .onMalformedInput(CodingErrorAction.REPLACE)
489                    .onUnmappableCharacter(CodingErrorAction.REPLACE).decode(buffer);
490        } catch (CharacterCodingException ex) {
491            throw new Error(ex.getMessage(), ex);
492        }
493    }
494
495    /*
496     * -------------------------------------------------------------------
497     * Methods implementing parent interface Comparable
498     * -------------------------------------------------------------------
499     */
500
501    /**
502     * Compares this charset with the given charset. This comparison is
503     * based on the case insensitive canonical names of the charsets.
504     *
505     * @param charset
506     *            the given object to be compared with.
507     * @return a negative integer if less than the given object, a positive
508     *         integer if larger than it, or 0 if equal to it.
509     */
510    public final int compareTo(Charset charset) {
511        return this.canonicalName.compareToIgnoreCase(charset.canonicalName);
512    }
513
514    /*
515     * -------------------------------------------------------------------
516     * Methods overriding parent class Object
517     * -------------------------------------------------------------------
518     */
519
520    /**
521     * Determines whether this charset equals to the given object. They are
522     * considered to be equal if they have the same canonical name.
523     *
524     * @param obj
525     *            the given object to be compared with.
526     * @return true if they have the same canonical name, otherwise false.
527     */
528    @Override
529    public final boolean equals(Object obj) {
530        if (obj instanceof Charset) {
531            Charset that = (Charset) obj;
532            return this.canonicalName.equals(that.canonicalName);
533        }
534        return false;
535    }
536
537    /**
538     * Gets the hash code of this charset.
539     *
540     * @return the hash code of this charset.
541     */
542    @Override
543    public final int hashCode() {
544        return this.canonicalName.hashCode();
545    }
546
547    /**
548     * Gets a string representation of this charset. Usually this contains the
549     * canonical name of the charset.
550     *
551     * @return a string representation of this charset.
552     */
553    @Override
554    public final String toString() {
555        return getClass().getName() + "[" + this.canonicalName + "]";
556    }
557
558    /**
559     * Returns the system's default charset. This is determined during VM startup, and will not
560     * change thereafter. On Android, the default charset is UTF-8.
561     */
562    public static Charset defaultCharset() {
563        return DEFAULT_CHARSET;
564    }
565
566    private static Charset getDefaultCharset() {
567        String encoding = System.getProperty("file.encoding", "UTF-8");
568        try {
569            return Charset.forName(encoding);
570        } catch (UnsupportedCharsetException e) {
571            return Charset.forName("UTF-8");
572        }
573    }
574}
575