CharsetDetector.java revision 1537b2f39245c07b00aa78c3600f7aebcb172490
1/* GENERATED SOURCE. DO NOT MODIFY. */
2/**
3*******************************************************************************
4* Copyright (C) 2005-2014, International Business Machines Corporation and    *
5* others. All Rights Reserved.                                                *
6*******************************************************************************
7*/
8package android.icu.text;
9
10import java.io.IOException;
11import java.io.InputStream;
12import java.io.Reader;
13import java.util.ArrayList;
14import java.util.Arrays;
15import java.util.Collections;
16import java.util.List;
17
18
19/**
20 * <code>CharsetDetector</code> provides a facility for detecting the
21 * charset or encoding of character data in an unknown format.
22 * The input data can either be from an input stream or an array of bytes.
23 * The result of the detection operation is a list of possibly matching
24 * charsets, or, for simple use, you can just ask for a Java Reader that
 * will work over the input data.
26 * <p/>
27 * Character set detection is at best an imprecise operation.  The detection
28 * process will attempt to identify the charset that best matches the characteristics
29 * of the byte data, but the process is partly statistical in nature, and
30 * the results can not be guaranteed to always be correct.
31 * <p/>
32 * For best accuracy in charset detection, the input data should be primarily
33 * in a single language, and a minimum of a few hundred bytes worth of plain text
34 * in the language are needed.  The detection process will attempt to
35 * ignore html or xml style markup that could otherwise obscure the content.
36 * <p/>
37 * @hide Only a subset of ICU is exposed in Android
38 * @hide All android.icu classes are currently hidden
39 */
40public class CharsetDetector {
41
42//   Question: Should we have getters corresponding to the setters for input text
43//   and declared encoding?
44
45//   A thought: If we were to create our own type of Java Reader, we could defer
46//   figuring out an actual charset for data that starts out with too much English
47//   only ASCII until the user actually read through to something that didn't look
48//   like 7 bit English.  If  nothing else ever appeared, we would never need to
49//   actually choose the "real" charset.  All assuming that the application just
50//   wants the data, and doesn't care about a char set name.
51
52    /**
53     *   Constructor
54     */
55    public CharsetDetector() {
56    }
57
58    /**
59     * Set the declared encoding for charset detection.
60     *  The declared encoding of an input text is an encoding obtained
61     *  from an http header or xml declaration or similar source that
62     *  can be provided as additional information to the charset detector.
63     *  A match between a declared encoding and a possible detected encoding
64     *  will raise the quality of that detected encoding by a small delta,
65     *  and will also appear as a "reason" for the match.
66     * <p/>
67     * A declared encoding that is incompatible with the input data being
68     * analyzed will not be added to the list of possible encodings.
69     *
70     *  @param encoding The declared encoding
71     */
72    public CharsetDetector setDeclaredEncoding(String encoding) {
73        fDeclaredEncoding = encoding;
74        return this;
75    }
76
77    /**
78     * Set the input text (byte) data whose charset is to be detected.
79     *
80     * @param in the input text of unknown encoding
81     *
82     * @return This CharsetDetector
83     */
84    public CharsetDetector setText(byte [] in) {
85        fRawInput  = in;
86        fRawLength = in.length;
87
88        return this;
89    }
90
91    private static final int kBufSize = 8000;
92
93    /**
94     * Set the input text (byte) data whose charset is to be detected.
95     *  <p/>
96     *   The input stream that supplies the character data must have markSupported()
97     *   == true; the charset detection process will read a small amount of data,
98     *   then return the stream to its original position via
99     *   the InputStream.reset() operation.  The exact amount that will
100     *   be read depends on the characteristics of the data itself.
101     *
102     * @param in the input text of unknown encoding
103     *
104     * @return This CharsetDetector
105     */
106
107    public CharsetDetector setText(InputStream in) throws IOException {
108        fInputStream = in;
109        fInputStream.mark(kBufSize);
110        fRawInput = new byte[kBufSize];   // Always make a new buffer because the
111                                          //   previous one may have come from the caller,
112                                          //   in which case we can't touch it.
113        fRawLength = 0;
114        int remainingLength = kBufSize;
115        while (remainingLength > 0 ) {
116            // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
117            int  bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
118            if (bytesRead <= 0) {
119                 break;
120            }
121            fRawLength += bytesRead;
122            remainingLength -= bytesRead;
123        }
124        fInputStream.reset();
125
126        return this;
127    }
128
129
130    /**
131     * Return the charset that best matches the supplied input data.
132     *
133     * Note though, that because the detection
134     * only looks at the start of the input data,
135     * there is a possibility that the returned charset will fail to handle
136     * the full set of input data.
137     * <p/>
138     * Raise an exception if
139     *  <ul>
140     *    <li>no charset appears to match the data.</li>
141     *    <li>no input text has been provided</li>
142     *  </ul>
143     *
144     * @return a CharsetMatch object representing the best matching charset, or
145     *         <code>null</code> if there are no matches.
146     */
147    public CharsetMatch detect() {
148//   TODO:  A better implementation would be to copy the detect loop from
149//          detectAll(), and cut it short as soon as a match with a high confidence
150//          is found.  This is something to be done later, after things are otherwise
151//          working.
152        CharsetMatch matches[] = detectAll();
153
154        if (matches == null || matches.length == 0) {
155            return null;
156        }
157
158        return matches[0];
159     }
160
161    /**
162     *  Return an array of all charsets that appear to be plausible
163     *  matches with the input data.  The array is ordered with the
164     *  best quality match first.
165     * <p/>
166     * Raise an exception if
167     *  <ul>
168     *    <li>no charsets appear to match the input data.</li>
169     *    <li>no input text has been provided</li>
170     *  </ul>
171     *
172     * @return An array of CharsetMatch objects representing possibly matching charsets.
173     */
174    public CharsetMatch[] detectAll() {
175        ArrayList<CharsetMatch>         matches = new ArrayList<CharsetMatch>();
176
177        MungeInput();  // Strip html markup, collect byte stats.
178
179        //  Iterate over all possible charsets, remember all that
180        //    give a match quality > 0.
181        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
182            CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
183            boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
184            if (active) {
185                CharsetMatch m = rcinfo.recognizer.match(this);
186                if (m != null) {
187                    matches.add(m);
188                }
189            }
190        }
191        Collections.sort(matches);      // CharsetMatch compares on confidence
192        Collections.reverse(matches);   //  Put best match first.
193        CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
194        resultArray = matches.toArray(resultArray);
195        return resultArray;
196    }
197
198
199    /**
200     * Autodetect the charset of an inputStream, and return a Java Reader
201     * to access the converted input data.
202     * <p/>
203     * This is a convenience method that is equivalent to
204     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
205     * <p/>
206     *   For the input stream that supplies the character data, markSupported()
207     *   must be true; the  charset detection will read a small amount of data,
208     *   then return the stream to its original position via
209     *   the InputStream.reset() operation.  The exact amount that will
210     *    be read depends on the characteristics of the data itself.
211     *<p/>
212     * Raise an exception if no charsets appear to match the input data.
213     *
214     * @param in The source of the byte data in the unknown charset.
215     *
216     * @param declaredEncoding  A declared encoding for the data, if available,
217     *           or null or an empty string if none is available.
218     */
219    public Reader getReader(InputStream in, String declaredEncoding) {
220        fDeclaredEncoding = declaredEncoding;
221
222        try {
223            setText(in);
224
225            CharsetMatch match = detect();
226
227            if (match == null) {
228                return null;
229            }
230
231            return match.getReader();
232        } catch (IOException e) {
233            return null;
234        }
235    }
236
237    /**
238     * Autodetect the charset of an inputStream, and return a String
239     * containing the converted input data.
240     * <p/>
241     * This is a convenience method that is equivalent to
242     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
243     *<p/>
244     * Raise an exception if no charsets appear to match the input data.
245     *
246     * @param in The source of the byte data in the unknown charset.
247     *
248     * @param declaredEncoding  A declared encoding for the data, if available,
249     *           or null or an empty string if none is available.
250     */
251    public String getString(byte[] in, String declaredEncoding)
252    {
253        fDeclaredEncoding = declaredEncoding;
254
255        try {
256            setText(in);
257
258            CharsetMatch match = detect();
259
260            if (match == null) {
261                return null;
262            }
263
264            return match.getString(-1);
265        } catch (IOException e) {
266            return null;
267        }
268    }
269
270
271    /**
272     * Get the names of all charsets supported by <code>CharsetDetector</code> class.
273     * <p>
274     * <b>Note:</b> Multiple different charset encodings in a same family may use
275     * a single shared name in this implementation. For example, this method returns
276     * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
277     * (Windows Latin 1). However, actual detection result could be "windows-1252"
278     * when the input data matches Latin 1 code points with any points only available
279     * in "windows-1252".
280     *
281     * @return an array of the names of all charsets supported by
282     * <code>CharsetDetector</code> class.
283     */
284    public static String[] getAllDetectableCharsets() {
285        String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
286        for (int i = 0; i < allCharsetNames.length; i++) {
287            allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
288        }
289        return allCharsetNames;
290    }
291
292    /**
293     * Test whether or not input filtering is enabled.
294     *
295     * @return <code>true</code> if input text will be filtered.
296     *
297     * @see #enableInputFilter
298     */
299    public boolean inputFilterEnabled()
300    {
301        return fStripTags;
302    }
303
304    /**
305     * Enable filtering of input text. If filtering is enabled,
306     * text within angle brackets ("<" and ">") will be removed
307     * before detection.
308     *
309     * @param filter <code>true</code> to enable input text filtering.
310     *
311     * @return The previous setting.
312     */
313    public boolean enableInputFilter(boolean filter)
314    {
315        boolean previous = fStripTags;
316
317        fStripTags = filter;
318
319        return previous;
320    }
321
322    /*
323     *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
324     *               it by removing what appears to be html markup.
325     */
326    private void MungeInput() {
327        int srci = 0;
328        int dsti = 0;
329        byte b;
330        boolean  inMarkup = false;
331        int      openTags = 0;
332        int      badTags  = 0;
333
334        //
335        //  html / xml markup stripping.
336        //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
337        //     discard everything within < brackets >
338        //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
339        //     guess as to whether the input was actually marked up at all.
340        if (fStripTags) {
341            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
342                b = fRawInput[srci];
343                if (b == (byte)'<') {
344                    if (inMarkup) {
345                        badTags++;
346                    }
347                    inMarkup = true;
348                    openTags++;
349                }
350
351                if (! inMarkup) {
352                    fInputBytes[dsti++] = b;
353                }
354
355                if (b == (byte)'>') {
356                    inMarkup = false;
357                }
358            }
359
360            fInputLen = dsti;
361        }
362
363        //
364        //  If it looks like this input wasn't marked up, or if it looks like it's
365        //    essentially nothing but markup abandon the markup stripping.
366        //    Detection will have to work on the unstripped input.
367        //
368        if (openTags<5 || openTags/5 < badTags ||
369                (fInputLen < 100 && fRawLength>600)) {
370            int limit = fRawLength;
371
372            if (limit > kBufSize) {
373                limit = kBufSize;
374            }
375
376            for (srci=0; srci<limit; srci++) {
377                fInputBytes[srci] = fRawInput[srci];
378            }
379            fInputLen = srci;
380        }
381
382        //
383        // Tally up the byte occurence statistics.
384        //   These are available for use by the various detectors.
385        //
386        Arrays.fill(fByteStats, (short)0);
387        for (srci=0; srci<fInputLen; srci++) {
388            int val = fInputBytes[srci] & 0x00ff;
389            fByteStats[val]++;
390        }
391
392        fC1Bytes = false;
393        for (int i = 0x80; i <= 0x9F; i += 1) {
394            if (fByteStats[i] != 0) {
395                fC1Bytes = true;
396                break;
397            }
398        }
399     }
400
401    /*
402     *  The following items are accessed by individual CharsetRecongizers during
403     *     the recognition process
404     *
405     */
406    byte[]      fInputBytes =       // The text to be checked.  Markup will have been
407                   new byte[kBufSize];  //   removed if appropriate.
408
409    int         fInputLen;          // Length of the byte data in fInputBytes.
410
411    short       fByteStats[] =      // byte frequency statistics for the input text.
412                   new short[256];  //   Value is percent, not absolute.
413                                    //   Value is rounded up, so zero really means zero occurences.
414
415    boolean     fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
416                   false;
417
418    String      fDeclaredEncoding;
419
420
421    byte[]               fRawInput;     // Original, untouched input bytes.
422                                        //  If user gave us a byte array, this is it.
423                                        //  If user gave us a stream, it's read to a
424                                        //  buffer here.
425    int                  fRawLength;    // Length of data in fRawInput array.
426
427    InputStream          fInputStream;  // User's input stream, or null if the user
428                                        //   gave us a byte array.
429
430    //
431    //  Stuff private to CharsetDetector
432    //
433    private boolean      fStripTags =   // If true, setText() will strip tags from input text.
434                           false;
435
436    private boolean[]    fEnabledRecognizers;   // If not null, active set of charset recognizers had
437                                                // been changed from the default. The array index is
438                                                // corresponding to ALL_RECOGNIZER. See setDetectableCharset().
439
440    private static class CSRecognizerInfo {
441        CharsetRecognizer recognizer;
442        boolean isDefaultEnabled;
443
444        CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
445            this.recognizer = recognizer;
446            this.isDefaultEnabled = isDefaultEnabled;
447        }
448    }
449
450    /*
451     * List of recognizers for all charsets known to the implementation.
452     */
453    private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;
454
455    static {
456        List<CSRecognizerInfo> list = new ArrayList<CSRecognizerInfo>();
457
458        list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
459        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
460        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
461        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
462        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));
463
464        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
465        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
466        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
467        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
468        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true));
469        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
470        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
471        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));
472
473        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
474        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
475        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
476        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
477        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
478        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
479        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
480        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
481        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
482        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
483        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));
484
485        // IBM 420/424 recognizers are disabled by default
486        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
487        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
488        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
489        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));
490
491        ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
492    }
493
494    /**
495     * Get the names of charsets that can be recognized by this CharsetDetector instance.
496     *
497     * @return an array of the names of charsets that can be recognized by this CharsetDetector
498     * instance.
499     *
500     * @deprecated This API is ICU internal only.
501     * @hide draft / provisional / internal are hidden on Android
502     */
503    @Deprecated
504    public String[] getDetectableCharsets() {
505        List<String> csnames = new ArrayList<String>(ALL_CS_RECOGNIZERS.size());
506        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
507            CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
508            boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i];
509            if (active) {
510                csnames.add(rcinfo.recognizer.getName());
511            }
512        }
513        return csnames.toArray(new String[csnames.size()]);
514    }
515
516    /**
517     * Enable or disable individual charset encoding.
518     * A name of charset encoding must be included in the names returned by
519     * {@link #getAllDetectableCharsets()}.
520     *
521     * @param encoding the name of charset encoding.
522     * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
523     * charset encoding.
524     * @return A reference to this <code>CharsetDetector</code>.
525     * @throws IllegalArgumentException when the name of charset encoding is
526     * not supported.
527     *
528     * @deprecated This API is ICU internal only.
529     * @hide draft / provisional / internal are hidden on Android
530     */
531    @Deprecated
532    public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
533        int modIdx = -1;
534        boolean isDefaultVal = false;
535        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
536            CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
537            if (csrinfo.recognizer.getName().equals(encoding)) {
538                modIdx = i;
539                isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
540                break;
541            }
542        }
543        if (modIdx < 0) {
544            // No matching encoding found
545            throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
546        }
547
548        if (fEnabledRecognizers == null && !isDefaultVal) {
549            // Create an array storing the non default setting
550            fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
551
552            // Initialize the array with default info
553            for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
554                fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
555            }
556        }
557
558        if (fEnabledRecognizers != null) {
559            fEnabledRecognizers[modIdx] = enabled;
560        }
561
562        return this;
563    }
564}
565