CharsetDetector.java revision bfab1e7fec36dff93fb980c546ad64a565faf9fc
1/* GENERATED SOURCE. DO NOT MODIFY. */
2/**
3*******************************************************************************
4* Copyright (C) 2005-2016, International Business Machines Corporation and    *
5* others. All Rights Reserved.                                                *
6*******************************************************************************
7*/
8package android.icu.text;
9
10import java.io.IOException;
11import java.io.InputStream;
12import java.io.Reader;
13import java.util.ArrayList;
14import java.util.Arrays;
15import java.util.Collections;
16import java.util.List;
17
18
19/**
20 * <code>CharsetDetector</code> provides a facility for detecting the
21 * charset or encoding of character data in an unknown format.
22 * The input data can either be from an input stream or an array of bytes.
23 * The result of the detection operation is a list of possibly matching
24 * charsets, or, for simple use, you can just ask for a Java Reader that
 * will work over the input data.
26 * <p>
27 * Character set detection is at best an imprecise operation.  The detection
28 * process will attempt to identify the charset that best matches the characteristics
29 * of the byte data, but the process is partly statistical in nature, and
30 * the results can not be guaranteed to always be correct.
31 * <p>
32 * For best accuracy in charset detection, the input data should be primarily
33 * in a single language, and a minimum of a few hundred bytes worth of plain text
34 * in the language are needed.  The detection process will attempt to
35 * ignore html or xml style markup that could otherwise obscure the content.
36 * <p>
37 * @hide Only a subset of ICU is exposed in Android
38 */
39public class CharsetDetector {
40
41//   Question: Should we have getters corresponding to the setters for input text
42//   and declared encoding?
43
44//   A thought: If we were to create our own type of Java Reader, we could defer
45//   figuring out an actual charset for data that starts out with too much English
46//   only ASCII until the user actually read through to something that didn't look
47//   like 7 bit English.  If  nothing else ever appeared, we would never need to
48//   actually choose the "real" charset.  All assuming that the application just
49//   wants the data, and doesn't care about a char set name.
50
    /**
     * Constructs a detector with no input text and no declared encoding.
     * Input filtering starts out disabled, and the default set of charset
     * recognizers is used until {@link #setDetectableCharset} changes it.
     */
    public CharsetDetector() {
    }
56
    /**
     * Set the declared encoding for charset detection.
     *  The declared encoding of an input text is an encoding obtained
     *  from an http header or xml declaration or similar source that
     *  can be provided as additional information to the charset detector.
     *  A match between a declared encoding and a possible detected encoding
     *  will raise the quality of that detected encoding by a small delta,
     *  and will also appear as a "reason" for the match.
     * <p>
     * A declared encoding that is incompatible with the input data being
     * analyzed will not be added to the list of possible encodings.
     * <p>
     * This method only records the value; it is the individual recognizers
     * that read it during detection.
     *
     *  @param encoding The declared encoding
     *
     *  @return This CharsetDetector
     */
    public CharsetDetector setDeclaredEncoding(String encoding) {
        fDeclaredEncoding = encoding;
        return this;
    }
75
76    /**
77     * Set the input text (byte) data whose charset is to be detected.
78     *
79     * @param in the input text of unknown encoding
80     *
81     * @return This CharsetDetector
82     */
83    public CharsetDetector setText(byte [] in) {
84        fRawInput  = in;
85        fRawLength = in.length;
86
87        return this;
88    }
89
    // Upper bound, in bytes, on the amount of input examined: it sizes the
    //   stream read-ahead buffer, the mark() limit, and fInputBytes.
    private static final int kBufSize = 8000;
91
92    /**
93     * Set the input text (byte) data whose charset is to be detected.
94     *  <p>
95     *   The input stream that supplies the character data must have markSupported()
96     *   == true; the charset detection process will read a small amount of data,
97     *   then return the stream to its original position via
98     *   the InputStream.reset() operation.  The exact amount that will
99     *   be read depends on the characteristics of the data itself.
100     *
101     * @param in the input text of unknown encoding
102     *
103     * @return This CharsetDetector
104     */
105
106    public CharsetDetector setText(InputStream in) throws IOException {
107        fInputStream = in;
108        fInputStream.mark(kBufSize);
109        fRawInput = new byte[kBufSize];   // Always make a new buffer because the
110                                          //   previous one may have come from the caller,
111                                          //   in which case we can't touch it.
112        fRawLength = 0;
113        int remainingLength = kBufSize;
114        while (remainingLength > 0 ) {
115            // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
116            int  bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
117            if (bytesRead <= 0) {
118                 break;
119            }
120            fRawLength += bytesRead;
121            remainingLength -= bytesRead;
122        }
123        fInputStream.reset();
124
125        return this;
126    }
127
128
129    /**
130     * Return the charset that best matches the supplied input data.
131     *
132     * Note though, that because the detection
133     * only looks at the start of the input data,
134     * there is a possibility that the returned charset will fail to handle
135     * the full set of input data.
136     * <p>
137     * Raise an exception if
138     *  <ul>
139     *    <li>no charset appears to match the data.</li>
140     *    <li>no input text has been provided</li>
141     *  </ul>
142     *
143     * @return a CharsetMatch object representing the best matching charset, or
144     *         <code>null</code> if there are no matches.
145     */
146    public CharsetMatch detect() {
147//   TODO:  A better implementation would be to copy the detect loop from
148//          detectAll(), and cut it short as soon as a match with a high confidence
149//          is found.  This is something to be done later, after things are otherwise
150//          working.
151        CharsetMatch matches[] = detectAll();
152
153        if (matches == null || matches.length == 0) {
154            return null;
155        }
156
157        return matches[0];
158     }
159
160    /**
161     *  Return an array of all charsets that appear to be plausible
162     *  matches with the input data.  The array is ordered with the
163     *  best quality match first.
164     * <p>
165     * Raise an exception if
166     *  <ul>
167     *    <li>no charsets appear to match the input data.</li>
168     *    <li>no input text has been provided</li>
169     *  </ul>
170     *
171     * @return An array of CharsetMatch objects representing possibly matching charsets.
172     */
173    public CharsetMatch[] detectAll() {
174        ArrayList<CharsetMatch>         matches = new ArrayList<CharsetMatch>();
175
176        MungeInput();  // Strip html markup, collect byte stats.
177
178        //  Iterate over all possible charsets, remember all that
179        //    give a match quality > 0.
180        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
181            CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
182            boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
183            if (active) {
184                CharsetMatch m = rcinfo.recognizer.match(this);
185                if (m != null) {
186                    matches.add(m);
187                }
188            }
189        }
190        Collections.sort(matches);      // CharsetMatch compares on confidence
191        Collections.reverse(matches);   //  Put best match first.
192        CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
193        resultArray = matches.toArray(resultArray);
194        return resultArray;
195    }
196
197
198    /**
199     * Autodetect the charset of an inputStream, and return a Java Reader
200     * to access the converted input data.
201     * <p>
202     * This is a convenience method that is equivalent to
203     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
204     * <p>
205     *   For the input stream that supplies the character data, markSupported()
206     *   must be true; the  charset detection will read a small amount of data,
207     *   then return the stream to its original position via
208     *   the InputStream.reset() operation.  The exact amount that will
209     *    be read depends on the characteristics of the data itself.
210     *<p>
211     * Raise an exception if no charsets appear to match the input data.
212     *
213     * @param in The source of the byte data in the unknown charset.
214     *
215     * @param declaredEncoding  A declared encoding for the data, if available,
216     *           or null or an empty string if none is available.
217     */
218    public Reader getReader(InputStream in, String declaredEncoding) {
219        fDeclaredEncoding = declaredEncoding;
220
221        try {
222            setText(in);
223
224            CharsetMatch match = detect();
225
226            if (match == null) {
227                return null;
228            }
229
230            return match.getReader();
231        } catch (IOException e) {
232            return null;
233        }
234    }
235
236    /**
237     * Autodetect the charset of an inputStream, and return a String
238     * containing the converted input data.
239     * <p>
240     * This is a convenience method that is equivalent to
241     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
242     *<p>
243     * Raise an exception if no charsets appear to match the input data.
244     *
245     * @param in The source of the byte data in the unknown charset.
246     *
247     * @param declaredEncoding  A declared encoding for the data, if available,
248     *           or null or an empty string if none is available.
249     */
250    public String getString(byte[] in, String declaredEncoding)
251    {
252        fDeclaredEncoding = declaredEncoding;
253
254        try {
255            setText(in);
256
257            CharsetMatch match = detect();
258
259            if (match == null) {
260                return null;
261            }
262
263            return match.getString(-1);
264        } catch (IOException e) {
265            return null;
266        }
267    }
268
269
270    /**
271     * Get the names of all charsets supported by <code>CharsetDetector</code> class.
272     * <p>
273     * <b>Note:</b> Multiple different charset encodings in a same family may use
274     * a single shared name in this implementation. For example, this method returns
275     * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
276     * (Windows Latin 1). However, actual detection result could be "windows-1252"
277     * when the input data matches Latin 1 code points with any points only available
278     * in "windows-1252".
279     *
280     * @return an array of the names of all charsets supported by
281     * <code>CharsetDetector</code> class.
282     */
283    public static String[] getAllDetectableCharsets() {
284        String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
285        for (int i = 0; i < allCharsetNames.length; i++) {
286            allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
287        }
288        return allCharsetNames;
289    }
290
    /**
     * Test whether or not input filtering is enabled.
     * Filtering is disabled on a newly constructed detector.
     *
     * @return <code>true</code> if input text will be filtered.
     *
     * @see #enableInputFilter
     */
    public boolean inputFilterEnabled()
    {
        return fStripTags;
    }
302
    /**
     * Enable filtering of input text. If filtering is enabled,
     * text within angle brackets ("&lt;" and "&gt;") will be removed
     * before detection.
     * <p>
     * The filtering itself happens when detection runs, and is abandoned for
     * input that contains few tags, many nested '&lt;' characters, or almost
     * nothing but markup; such input is analyzed unfiltered.
     *
     * @param filter <code>true</code> to enable input text filtering.
     *
     * @return The previous setting.
     */
    public boolean enableInputFilter(boolean filter)
    {
        // Remember the old value so it can be handed back to the caller.
        boolean previous = fStripTags;

        fStripTags = filter;

        return previous;
    }
320
321    /*
322     *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
323     *               it by removing what appears to be html markup.
324     */
325    private void MungeInput() {
326        int srci = 0;
327        int dsti = 0;
328        byte b;
329        boolean  inMarkup = false;
330        int      openTags = 0;
331        int      badTags  = 0;
332
333        //
334        //  html / xml markup stripping.
335        //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
336        //     discard everything within < brackets >
337        //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
338        //     guess as to whether the input was actually marked up at all.
339        if (fStripTags) {
340            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
341                b = fRawInput[srci];
342                if (b == (byte)'<') {
343                    if (inMarkup) {
344                        badTags++;
345                    }
346                    inMarkup = true;
347                    openTags++;
348                }
349
350                if (! inMarkup) {
351                    fInputBytes[dsti++] = b;
352                }
353
354                if (b == (byte)'>') {
355                    inMarkup = false;
356                }
357            }
358
359            fInputLen = dsti;
360        }
361
362        //
363        //  If it looks like this input wasn't marked up, or if it looks like it's
364        //    essentially nothing but markup abandon the markup stripping.
365        //    Detection will have to work on the unstripped input.
366        //
367        if (openTags<5 || openTags/5 < badTags ||
368                (fInputLen < 100 && fRawLength>600)) {
369            int limit = fRawLength;
370
371            if (limit > kBufSize) {
372                limit = kBufSize;
373            }
374
375            for (srci=0; srci<limit; srci++) {
376                fInputBytes[srci] = fRawInput[srci];
377            }
378            fInputLen = srci;
379        }
380
381        //
382        // Tally up the byte occurence statistics.
383        //   These are available for use by the various detectors.
384        //
385        Arrays.fill(fByteStats, (short)0);
386        for (srci=0; srci<fInputLen; srci++) {
387            int val = fInputBytes[srci] & 0x00ff;
388            fByteStats[val]++;
389        }
390
391        fC1Bytes = false;
392        for (int i = 0x80; i <= 0x9F; i += 1) {
393            if (fByteStats[i] != 0) {
394                fC1Bytes = true;
395                break;
396            }
397        }
398     }
399
    /*
     *  The following items are accessed by individual CharsetRecognizers during
     *     the recognition process.  They are filled in by setText() and MungeInput().
     *
     */
    byte[]      fInputBytes =       // The text to be checked.  Markup will have been
                   new byte[kBufSize];  //   removed if appropriate.

    int         fInputLen;          // Length of the byte data in fInputBytes.

    short       fByteStats[] =      // Byte frequency statistics for the input text, indexed by
                   new short[256];  //   unsigned byte value.  MungeInput() stores an absolute
                                    //   occurrence count (not a percentage), so zero really
                                    //   means zero occurrences.

    boolean     fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
                   false;           //   recomputed by MungeInput() on every detection.

    String      fDeclaredEncoding;  // Encoding declared by the caller, or null if none was given.


    byte[]               fRawInput;     // Original, untouched input bytes.
                                        //  If user gave us a byte array, this is it.
                                        //  If user gave us a stream, it's read to a
                                        //  buffer here.
    int                  fRawLength;    // Length of data in fRawInput array.

    InputStream          fInputStream;  // User's input stream, or null if the user
                                        //   gave us a byte array.

    //
    //  Stuff private to CharsetDetector
    //
    private boolean      fStripTags =   // If true, MungeInput() (run from detectAll()) strips
                           false;       //   tags from the input text before analysis.

    private boolean[]    fEnabledRecognizers;   // If not null, the active set of charset recognizers
                                                // has been changed from the default. The array index
                                                // corresponds to ALL_CS_RECOGNIZERS.
                                                // See setDetectableCharset().
438
439    private static class CSRecognizerInfo {
440        CharsetRecognizer recognizer;
441        boolean isDefaultEnabled;
442
443        CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
444            this.recognizer = recognizer;
445            this.isDefaultEnabled = isDefaultEnabled;
446        }
447    }
448
    /*
     * List of recognizers for all charsets known to the implementation.
     *
     * The position of each entry is significant: fEnabledRecognizers is indexed
     * in parallel with this list, and detectAll() runs the recognizers in this
     * order.  The list is wrapped as unmodifiable once it has been built.
     */
    private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;

    static {
        List<CSRecognizerInfo> list = new ArrayList<CSRecognizerInfo>();

        // UTF-8, UTF-16 and UTF-32 recognizers.
        list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));

        // Recognizers from the CharsetRecog_mbcs and CharsetRecog_2022 families.
        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));

        // Recognizers from the CharsetRecog_sbcs family that are enabled by default.
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));

        // IBM 420/424 recognizers are disabled by default; they only run after
        //   being switched on through setDetectableCharset().
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));

        ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
    }
492
493    /**
494     * Get the names of charsets that can be recognized by this CharsetDetector instance.
495     *
496     * @return an array of the names of charsets that can be recognized by this CharsetDetector
497     * instance.
498     *
499     * @deprecated This API is ICU internal only.
500     * @hide draft / provisional / internal are hidden on Android
501     */
502    @Deprecated
503    public String[] getDetectableCharsets() {
504        List<String> csnames = new ArrayList<String>(ALL_CS_RECOGNIZERS.size());
505        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
506            CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
507            boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i];
508            if (active) {
509                csnames.add(rcinfo.recognizer.getName());
510            }
511        }
512        return csnames.toArray(new String[csnames.size()]);
513    }
514
515    /**
516     * Enable or disable individual charset encoding.
517     * A name of charset encoding must be included in the names returned by
518     * {@link #getAllDetectableCharsets()}.
519     *
520     * @param encoding the name of charset encoding.
521     * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
522     * charset encoding.
523     * @return A reference to this <code>CharsetDetector</code>.
524     * @throws IllegalArgumentException when the name of charset encoding is
525     * not supported.
526     *
527     * @deprecated This API is ICU internal only.
528     * @hide draft / provisional / internal are hidden on Android
529     */
530    @Deprecated
531    public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
532        int modIdx = -1;
533        boolean isDefaultVal = false;
534        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
535            CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
536            if (csrinfo.recognizer.getName().equals(encoding)) {
537                modIdx = i;
538                isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
539                break;
540            }
541        }
542        if (modIdx < 0) {
543            // No matching encoding found
544            throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
545        }
546
547        if (fEnabledRecognizers == null && !isDefaultVal) {
548            // Create an array storing the non default setting
549            fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
550
551            // Initialize the array with default info
552            for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
553                fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
554            }
555        }
556
557        if (fEnabledRecognizers != null) {
558            fEnabledRecognizers[modIdx] = enabled;
559        }
560
561        return this;
562    }
563}
564