1/**
2 * Copyright (c) 2008, http://www.snakeyaml.org
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16package org.yaml.snakeyaml.reader;
17
18/**
19 version: 1.1 / 2007-01-25
20 - changed BOM recognition ordering (longer boms first)
21
22 Original pseudocode   : Thomas Weidenfeller
23 Implementation tweaked: Aki Nieminen
24 Implementation changed: Andrey Somov
25 * UTF-32 removed because it is not supported by YAML
26 * no default encoding
27
28 http://www.unicode.org/unicode/faq/utf_bom.html
29 BOMs:
30 00 00 FE FF    = UTF-32, big-endian
31 FF FE 00 00    = UTF-32, little-endian
32 EF BB BF       = UTF-8,
33 FE FF          = UTF-16, big-endian
34 FF FE          = UTF-16, little-endian
35
36 Win2k Notepad:
37 Unicode format = UTF-16LE
38 ***/
39
40import java.io.IOException;
41import java.io.InputStream;
42import java.io.InputStreamReader;
43import java.io.PushbackInputStream;
44import java.io.Reader;
45import java.nio.charset.Charset;
46import java.nio.charset.CharsetDecoder;
47import java.nio.charset.CodingErrorAction;
48
/**
 * Unicode text reader that uses the byte order mark (BOM) at the start of the
 * stream to choose the decoder. UTF-8, UTF-16BE and UTF-16LE marks are
 * recognised and skipped; when no BOM is present the stream is decoded as
 * UTF-8.
 *
 * <p>
 * The encoding is detected lazily, on the first call to {@link #init()} or
 * {@link #read(char[], int, int)}.
 */
public class UnicodeReader extends Reader {
    private static final Charset UTF8 = Charset.forName("UTF-8");
    private static final Charset UTF16BE = Charset.forName("UTF-16BE");
    private static final Charset UTF16LE = Charset.forName("UTF-16LE");

    /** Raw byte stream; the pushback buffer holds non-BOM bytes read ahead. */
    PushbackInputStream internalIn;

    /** Decoding reader; stays {@code null} until {@link #init()} has run. */
    InputStreamReader internalIn2 = null;

    /** Length in bytes of the longest recognised BOM (UTF-8: EF BB BF). */
    private static final int BOM_SIZE = 3;

    /**
     * Creates a reader over the given byte stream. Nothing is read from the
     * stream until the reader is initialized.
     *
     * @param in
     *            InputStream to be read
     */
    public UnicodeReader(InputStream in) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
    }

    /**
     * Get stream encoding or NULL if stream is uninitialized. Call init() or
     * read() method to initialize it.
     *
     * @return the encoding name reported by the underlying
     *         {@link InputStreamReader}, or {@code null} if the encoding has
     *         not been detected yet
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Read-ahead up to three bytes and check for BOM marks. Extra bytes are
     * unread back to the stream, only BOM bytes are skipped. Calling this
     * method again after a successful initialization does nothing.
     *
     * @throws IOException
     *             if reading from or pushing back to the underlying stream
     *             fails
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];
        int n = readAhead(bom);

        Charset encoding;
        int bomLength;
        if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
            encoding = UTF8;
            bomLength = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = UTF16BE;
            bomLength = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = UTF16LE;
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = UTF8;
            bomLength = 0;
        }

        // Push back everything that was read ahead but is not part of the BOM.
        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        // Use the detected encoding; unmappable input is reported as an error.
        CharsetDecoder decoder = encoding.newDecoder().onUnmappableCharacter(
                CodingErrorAction.REPORT);
        internalIn2 = new InputStreamReader(internalIn, decoder);
    }

    /**
     * Fills the buffer from the raw stream, looping because a single
     * {@code read} call may return fewer bytes than requested.
     *
     * @return the number of bytes actually stored in {@code buffer}; less than
     *         {@code buffer.length} only when the stream ended early
     */
    private int readAhead(byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int count = internalIn.read(buffer, total, buffer.length - total);
            if (count <= 0) {
                // End of stream (or a stream that makes no progress).
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Closes the reader and the underlying stream. If the reader was never
     * initialized, the raw stream is closed directly without reading from it.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters into a portion of an array, detecting the
     * encoding first if that has not happened yet.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return the number of characters read, or -1 at the end of the stream
     * @throws IOException
     *             if reading or decoding fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}