1/**
2*******************************************************************************
3* Copyright (C) 2005-2012, International Business Machines Corporation and    *
4* others. All Rights Reserved.                                                *
5*******************************************************************************
6*/
7package com.ibm.icu.text;
8
9import java.io.ByteArrayInputStream;
10import java.io.IOException;
11import java.io.InputStream;
12import java.io.InputStreamReader;
13import java.io.Reader;
14
15
16/**
17 * This class represents a charset that has been identified by a CharsetDetector
18 * as a possible encoding for a set of input data.  From an instance of this
19 * class, you can ask for a confidence level in the charset identification,
20 * or for Java Reader or String to access the original byte data in Unicode form.
 * <p>
 * Instances of this class are created only by CharsetDetectors.
 * <p>
24 * Note:  this class has a natural ordering that is inconsistent with equals.
25 *        The natural ordering is based on the match confidence value.
26 *
27 * @stable ICU 3.4
28 */
29public class CharsetMatch implements Comparable<CharsetMatch> {
30
31
32    /**
33     * Create a java.io.Reader for reading the Unicode character data corresponding
34     * to the original byte data supplied to the Charset detect operation.
35     * <p/>
36     * CAUTION:  if the source of the byte data was an InputStream, a Reader
37     * can be created for only one matching char set using this method.  If more
38     * than one charset needs to be tried, the caller will need to reset
39     * the InputStream and create InputStreamReaders itself, based on the charset name.
40     *
41     * @return the Reader for the Unicode character data.
42     *
43     * @stable ICU 3.4
44     */
45    public Reader getReader() {
46        InputStream inputStream = fInputStream;
47
48        if (inputStream == null) {
49            inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
50        }
51
52        try {
53            inputStream.reset();
54            return new InputStreamReader(inputStream, getName());
55        } catch (IOException e) {
56            return null;
57        }
58    }
59
60    /**
61     * Create a Java String from Unicode character data corresponding
62     * to the original byte data supplied to the Charset detect operation.
63     *
64     * @return a String created from the converted input data.
65     *
66     * @stable ICU 3.4
67     */
68    public String getString()  throws java.io.IOException {
69        return getString(-1);
70
71    }
72
73    /**
74     * Create a Java String from Unicode character data corresponding
75     * to the original byte data supplied to the Charset detect operation.
76     * The length of the returned string is limited to the specified size;
77     * the string will be trunctated to this length if necessary.  A limit value of
78     * zero or less is ignored, and treated as no limit.
79     *
80     * @param maxLength The maximium length of the String to be created when the
81     *                  source of the data is an input stream, or -1 for
82     *                  unlimited length.
83     * @return a String created from the converted input data.
84     *
85     * @stable ICU 3.4
86     */
87    public String getString(int maxLength) throws java.io.IOException {
88        String result = null;
89        if (fInputStream != null) {
90            StringBuilder sb = new StringBuilder();
91            char[] buffer = new char[1024];
92            Reader reader = getReader();
93            int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
94            int bytesRead = 0;
95
96            while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
97                sb.append(buffer, 0, bytesRead);
98                max -= bytesRead;
99            }
100
101            reader.close();
102
103            return sb.toString();
104        } else {
105            String name = getName();
106            /*
107             * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot
108             * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr'
109             * should be stripped off before creating the string.
110             */
111            int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
112            if (startSuffix > 0) {
113                name = name.substring(0, startSuffix);
114            }
115            result = new String(fRawInput, name);
116        }
117        return result;
118
119    }
120
121    /**
122     * Get an indication of the confidence in the charset detected.
123     * Confidence values range from 0-100, with larger numbers indicating
124     * a better match of the input data to the characteristics of the
125     * charset.
126     *
127     * @return the confidence in the charset match
128     *
129     * @stable ICU 3.4
130     */
131    public int getConfidence() {
132        return fConfidence;
133    }
134
135    /**
136     * Get the name of the detected charset.
137     * The name will be one that can be used with other APIs on the
138     * platform that accept charset names.  It is the "Canonical name"
139     * as defined by the class java.nio.charset.Charset; for
140     * charsets that are registered with the IANA charset registry,
141     * this is the MIME-preferred registerd name.
142     *
143     * @see java.nio.charset.Charset
144     * @see java.io.InputStreamReader
145     *
146     * @return The name of the charset.
147     *
148     * @stable ICU 3.4
149     */
150    public String getName() {
151        return fCharsetName;
152    }
153
154    /**
155     * Get the ISO code for the language of the detected charset.
156     *
157     * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
158     *
159     * @stable ICU 3.4
160     */
161    public String getLanguage() {
162        return fLang;
163    }
164
165    /**
166     * Compare to other CharsetMatch objects.
167     * Comparison is based on the match confidence value, which
168     *   allows CharsetDetector.detectAll() to order its results.
169     *
170     * @param other the CharsetMatch object to compare against.
171     * @return  a negative integer, zero, or a positive integer as the
172     *          confidence level of this CharsetMatch
173     *          is less than, equal to, or greater than that of
174     *          the argument.
175     * @throws ClassCastException if the argument is not a CharsetMatch.
176     * @stable ICU 4.4
177     */
178    public int compareTo (CharsetMatch other) {
179        int compareResult = 0;
180        if (this.fConfidence > other.fConfidence) {
181            compareResult = 1;
182        } else if (this.fConfidence < other.fConfidence) {
183            compareResult = -1;
184        }
185        return compareResult;
186    }
187
188    /*
189     *  Constructor.  Implementation internal
190     */
191    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
192        fConfidence = conf;
193
194        // The references to the original application input data must be copied out
195        //   of the charset recognizer to here, in case the application resets the
196        //   recognizer before using this CharsetMatch.
197        if (det.fInputStream == null) {
198            // We only want the existing input byte data if it came straight from the user,
199            //   not if is just the head of a stream.
200            fRawInput    = det.fRawInput;
201            fRawLength   = det.fRawLength;
202        }
203        fInputStream = det.fInputStream;
204        fCharsetName = rec.getName();
205        fLang = rec.getLanguage();
206    }
207
208    /*
209     *  Constructor.  Implementation internal
210     */
211    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
212        fConfidence = conf;
213
214        // The references to the original application input data must be copied out
215        //   of the charset recognizer to here, in case the application resets the
216        //   recognizer before using this CharsetMatch.
217        if (det.fInputStream == null) {
218            // We only want the existing input byte data if it came straight from the user,
219            //   not if is just the head of a stream.
220            fRawInput    = det.fRawInput;
221            fRawLength   = det.fRawLength;
222        }
223        fInputStream = det.fInputStream;
224        fCharsetName = csName;
225        fLang = lang;
226    }
227
228
229    //
230    //   Private Data
231    //
232    private int                 fConfidence;
233    private byte[]              fRawInput = null;     // Original, untouched input bytes.
234                                                      //  If user gave us a byte array, this is it.
235    private int                 fRawLength;           // Length of data in fRawInput array.
236
237    private InputStream         fInputStream = null;  // User's input stream, or null if the user
238                                                      //   gave us a byte array.
239
240    private String              fCharsetName;         // The name of the charset this CharsetMatch
241                                                      //   represents.  Filled in by the recognizer.
242    private String              fLang;                // The language, if one was determined by
243                                                      //   the recognizer during the detect operation.
244}
245