12ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/* GENERATED SOURCE. DO NOT MODIFY. */
2f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert// © 2016 and later: Unicode, Inc. and others.
3f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License
42ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/*
52ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller ****************************************************************************
62ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Copyright (C) 2005-2012, International Business Machines Corporation and *
72ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * others. All Rights Reserved.                                             *
82ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller ****************************************************************************
92ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *
102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */
112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerpackage android.icu.text;
122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport java.util.Arrays;
142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 *                   Match is determined mostly by the input data adhering to the
 *                   encoding scheme for the charset, and, optionally,
 *                   frequency-of-occurrence of characters.
202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * <p/>
212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *                   Instances of this class are singletons, one per encoding
222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *                   being recognized.  They are created in the main
232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *                   CharsetDetector class and kept in the global list of available
242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *                   encodings to be checked.  The specific encoding being recognized
252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *                   is determined by subclass.
262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */
272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerabstract class CharsetRecog_mbcs extends CharsetRecognizer {
282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
/**
 * Get the IANA name of the charset this recognizer detects.
 *
 * <p>Each concrete recognizer supplies its own fixed name; the recognizers
 * declared in this file return values such as {@code "Shift_JIS"} and
 * {@code "Big5"}.
 *
 * @return the charset name.
 */
@Override
abstract String      getName() ;
35f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
36f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    /**
382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * Test the match of this charset with the input text data
392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     *      which is obtained via the CharsetDetector object.
40f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert     *
412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * @param det  The CharsetDetector, which contains the input text
422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     *             to be checked for being in this charset.
432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * @return     Two values packed into one int  (Damn java, anyhow)
442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     *             <br/>
452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     *             bits 0-7:  the match confidence, ranging from 0-100
462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     *             <br/>
472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     *             bits 8-15: The match reason, an enum-like value.
482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     */
492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    int match(CharsetDetector det, int [] commonChars) {
502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        @SuppressWarnings("unused")
512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int   singleByteCharCount = 0;  //TODO Do we really need this?
522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int   doubleByteCharCount = 0;
532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int   commonCharCount     = 0;
542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int   badCharCount        = 0;
552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int   totalCharCount      = 0;
562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int   confidence          = 0;
572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        iteratedChar   iter       = new iteratedChar();
58f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        detectBlock: {
602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            for (iter.reset(); nextChar(iter, det);) {
612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                totalCharCount++;
622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                if (iter.error) {
63f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                    badCharCount++;
642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                } else {
652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    long cv = iter.charValue & 0xFFFFFFFFL;
66f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    if (cv <= 0xff) {
682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                        singleByteCharCount++;
692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    } else {
702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                        doubleByteCharCount++;
712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                        if (commonChars != null) {
722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            // NOTE: This assumes that there are no 4-byte common chars.
732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                                commonCharCount++;
752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            }
762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                        }
772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    }
782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                }
792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    // Bail out early if the byte data is not matching the encoding scheme.
812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    break detectBlock;
822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                }
832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            }
84f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            if (doubleByteCharCount <= 10 && badCharCount== 0) {
862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                // Not many multi-byte chars.
872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                if (doubleByteCharCount == 0 && totalCharCount < 10) {
882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    // We don't have enough data to have any confidence.
902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    // Statistical analysis of single byte non-ASCII charcters would probably help here.
912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    confidence = 0;
922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                }
932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                else {
942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    //   ASCII or ISO file?  It's probably not our encoding,
952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    //   but is not incompatible with our encoding, so don't give it a zero.
962ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    confidence = 10;
972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                }
98f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                break detectBlock;
1002ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            }
101f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
1022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            //
1032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            //  No match if there are too many characters that don't fit the encoding scheme.
1042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            //    (should we have zero tolerance for these?)
1052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            //
1062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            if (doubleByteCharCount < 20*badCharCount) {
1072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                confidence = 0;
1082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                break detectBlock;
1092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            }
110f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
1112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            if (commonChars == null) {
1122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                // We have no statistics on frequently occuring characters.
1132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                //  Assess confidence purely on having a reasonable number of
1142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                //  multi-byte characters (the more the better
1152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                confidence = 30 + doubleByteCharCount - 20*badCharCount;
1162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                if (confidence > 100) {
1172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    confidence = 100;
1182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                }
1192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            }else {
1202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                //
1212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                // Frequency of occurence statistics exist.
1222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                //
1232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                double maxVal = Math.log((float)doubleByteCharCount / 4);
1242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                double scaleFactor = 90.0 / maxVal;
1252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
1262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                confidence = Math.min(confidence, 100);
1272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            }
1282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        }   // end of detectBlock:
129f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
1302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        return confidence;
1312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    }
132f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
1332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     // "Character"  iterated character class.
1342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //    Recognizers for specific mbcs encodings make their "characters" available
1352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //    by providing a nextChar() function that fills in an instance of iteratedChar
1362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //    with the next char from the input.
1372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //    The returned characters are not converted to Unicode, but remain as the raw
1382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //    bytes (concatenated into an int) from the codepage data.
1392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //
1402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //  For Asian charsets, use the raw input rather than the input that has been
1412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //   stripped of markup.  Detection only considers multi-byte chars, effectively
1422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //   stripping markup anyway, and double byte chars do occur in markup too.
1432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     //
1442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     static class iteratedChar {
1452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         int             charValue = 0;             // 1-4 bytes from the raw input data
1462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         int             nextIndex = 0;
1472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         boolean         error     = false;
1482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         boolean         done      = false;
149f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
1502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         void reset() {
1512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             charValue = 0;
1522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             nextIndex = 0;
1532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             error     = false;
1542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             done      = false;
1552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
156f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
1572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         int nextByte(CharsetDetector det) {
1582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             if (nextIndex >= det.fRawLength) {
1592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 done = true;
1602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return -1;
1612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
162f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             int byteValue = det.fRawInput[nextIndex++] & 0x00ff;
1632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return byteValue;
164f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         }
1652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     }
166f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
/**
 * Get the next character (however many bytes it is) from the input data.
 *    Subclasses for specific charset encodings must implement this function
 *    to get characters according to the rules of their encoding scheme.
 *
 *  <p>The raw bytes of the character are stored in {@code it.charValue}, and
 *    {@code it.error} reports whether the byte sequence was illegal for the
 *    encoding; match(CharsetDetector, int[]) counts such characters as bad.
 *
 *  <p>This function is not a method of class iteratedChar only because
 *   that would require a lot of extra derived classes, which is awkward.
 *
 * @param it  The iteratedChar "struct" into which the returned char is placed.
 * @param det The charset detector, which is needed to get at the input byte data
 *            being iterated over.
 * @return    True if a character was returned, false at end of input.
 */
abstract boolean nextChar(iteratedChar it, CharsetDetector det);
1802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
1812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
182f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
183f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
184f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
1852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     /**
186f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert      *   Shift-JIS charset recognizer.
1872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      *
1882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      */
1892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     static class CharsetRecog_sjis extends CharsetRecog_mbcs {
190f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         static int [] commonChars =
1912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             // TODO:  This set of data comes from the character frequency-
1922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             //        of-occurence analysis tool.  The data needs to be moved
1932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             //        into a resource and loaded from there.
194f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert            {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
195f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
196f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
197f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
198f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
1992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
200f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
201f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
202f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        boolean nextChar(iteratedChar it, CharsetDetector det) {
2032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             it.error = false;
2042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int firstByte;
2052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             firstByte = it.charValue = it.nextByte(det);
2062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             if (firstByte < 0) {
2072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return false;
2082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
209f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
2102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
2112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return true;
2122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
213f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
2142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int secondByte = it.nextByte(det);
2152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             if (secondByte < 0)  {
216f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 return false;
2172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
2182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             it.charValue = (firstByte << 8) | secondByte;
2192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
2202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 // Illegal second byte value.
2212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 it.error = true;
2222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
2232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return true;
2242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
225f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
226f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
227f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        CharsetMatch match(CharsetDetector det) {
2282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int confidence = match(det, commonChars);
2292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
2302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
231f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
232f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
233f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        String getName() {
2342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return "Shift_JIS";
2352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
236f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
237f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
238f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        public String getLanguage()
2392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         {
2402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return "ja";
2412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
2422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
243f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
2442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     }
245f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
246f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
2472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     /**
248f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert      *   Big5 charset recognizer.
2492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      *
2502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      */
2512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     static class CharsetRecog_big5 extends CharsetRecog_mbcs {
252f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         static int [] commonChars =
2532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             // TODO:  This set of data comes from the character frequency-
2542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             //        of-occurence analysis tool.  The data needs to be moved
2552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             //        into a resource and loaded from there.
256f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert            {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
257f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
258f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
259f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
260f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
261f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
262f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
263f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
264f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
2652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
266f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
267f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
268f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        boolean nextChar(iteratedChar it, CharsetDetector det) {
2692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             it.error = false;
2702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int firstByte;
2712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             firstByte = it.charValue = it.nextByte(det);
2722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             if (firstByte < 0) {
2732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return false;
2742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
275f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
2762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             if (firstByte <= 0x7f || firstByte==0xff) {
2772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 // single byte character.
2782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return true;
2792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
280f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
2812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int secondByte = it.nextByte(det);
2822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             if (secondByte < 0)  {
283f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 return false;
2842ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
2852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             it.charValue = (it.charValue << 8) | secondByte;
2862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
2872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             if (secondByte < 0x40 ||
2882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 secondByte ==0x7f ||
2892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 secondByte == 0xff) {
2902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     it.error = true;
2912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
2922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return true;
2932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
294f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
295f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
296f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        CharsetMatch match(CharsetDetector det) {
2972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int confidence = match(det, commonChars);
2982ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
2992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
300f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
301f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
302f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        String getName() {
3032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return "Big5";
3042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
305f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
306f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
307f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
308f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        public String getLanguage()
3092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         {
3102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return "zh";
3112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
3122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     }
313f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
314f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
3152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     /**
3162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      *   EUC charset recognizers.  One abstract class that provides the common function
3172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      *             for getting the next character according to the EUC encoding scheme,
318f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert      *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
3192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      *
3202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      */
3212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
322f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
3232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         /*
3242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *  (non-Javadoc)
3252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *  Get the next character value for EUC based encodings.
3262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *  Character "value" is simply the raw bytes that make up the character
3272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *     packed into an int.
3282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          */
329f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
330f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        boolean nextChar(iteratedChar it, CharsetDetector det) {
3312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             it.error = false;
3322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int firstByte  = 0;
3332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int secondByte = 0;
3342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int thirdByte  = 0;
3352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             //int fourthByte = 0;
336f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
3372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             buildChar: {
338f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 firstByte = it.charValue = it.nextByte(det);
3392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 if (firstByte < 0) {
3402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // Ran off the end of the input data
3412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     it.done = true;
3422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     break buildChar;
3432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 }
3442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 if (firstByte <= 0x8d) {
3452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // single byte char
3462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     break buildChar;
3472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 }
348f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
3492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 secondByte = it.nextByte(det);
3502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 it.charValue = (it.charValue << 8) | secondByte;
351f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
3522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
3532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // Two byte Char
3542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     if (secondByte < 0xa1) {
3552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                         it.error = true;
3562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     }
3572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     break buildChar;
3582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 }
3592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 if (firstByte == 0x8e) {
3602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // Code Set 2.
3612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
3622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
3632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // We don't know which we've got.
3642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
365f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                     //   bytes will look like a well formed 2 byte char.
3662ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     if (secondByte < 0xa1) {
3672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                         it.error = true;
3682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     }
369f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                     break buildChar;
3702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 }
371f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
3722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 if (firstByte == 0x8f) {
3732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // Code set 3.
3742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // Three byte total char size, two bytes of actual char value.
3752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     thirdByte    = it.nextByte(det);
3762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     it.charValue = (it.charValue << 8) | thirdByte;
3772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     if (thirdByte < 0xa1) {
3782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                         it.error = true;
3792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     }
3802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 }
3812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller              }
382f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
3832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return (it.done == false);
3842ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
385f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
3862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         /**
3872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          * The charset recognize for EUC-JP.  A singleton instance of this class
3882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *    is created and kept by the public CharsetDetector class
3892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          */
3902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         static class CharsetRecog_euc_jp extends CharsetRecog_euc {
391f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             static int [] commonChars =
3922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 // TODO:  This set of data comes from the character frequency-
3932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 //        of-occurence analysis tool.  The data needs to be moved
3942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 //        into a resource and loaded from there.
395f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
396f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
397f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
398f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
399f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
400f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
401f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
402f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
403f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
404f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
405f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             @Override
406f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert            String getName() {
4072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return "EUC-JP";
4082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
409f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
410f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             @Override
411f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert            CharsetMatch match(CharsetDetector det) {
4122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 int confidence = match(det, commonChars);
4132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
4142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
415f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
416f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             @Override
417f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert            public String getLanguage()
4182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             {
4192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return "ja";
4202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
4212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
422f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
4232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         /**
4242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          * The charset recognize for EUC-KR.  A singleton instance of this class
4252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *    is created and kept by the public CharsetDetector class
4262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          */
4272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         static class CharsetRecog_euc_kr extends CharsetRecog_euc {
428f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             static int [] commonChars =
4292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 // TODO:  This set of data comes from the character frequency-
4302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 //        of-occurence analysis tool.  The data needs to be moved
4312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 //        into a resource and loaded from there.
432f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
433f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
434f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
435f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
436f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
437f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
438f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
439f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
440f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
4412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
442f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
443f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             @Override
444f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert            String getName() {
4452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return "EUC-KR";
4462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
447f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
448f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             @Override
449f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert            CharsetMatch match(CharsetDetector det) {
4502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 int confidence = match(det, commonChars);
4512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
4522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
453f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
454f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             @Override
455f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert            public String getLanguage()
4562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             {
4572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 return "ko";
4582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
4592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
4602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     }
461f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
4622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     /**
463f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert      *
464f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert      *   GB-18030 recognizer. Uses simplified Chinese statistics.
4652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      *
4662ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller      */
4672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
468f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
4692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         /*
4702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *  (non-Javadoc)
4712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *  Get the next character value for EUC based encodings.
4722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *  Character "value" is simply the raw bytes that make up the character
4732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          *     packed into an int.
4742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller          */
475f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
476f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        boolean nextChar(iteratedChar it, CharsetDetector det) {
4772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             it.error = false;
4782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int firstByte  = 0;
4792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int secondByte = 0;
4802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int thirdByte  = 0;
4812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int fourthByte = 0;
482f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
4832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             buildChar: {
484f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                 firstByte = it.charValue = it.nextByte(det);
485f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
4862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 if (firstByte < 0) {
4872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // Ran off the end of the input data
4882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     it.done = true;
4892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     break buildChar;
4902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 }
491f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
4922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 if (firstByte <= 0x80) {
4932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // single byte char
4942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     break buildChar;
4952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 }
496f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
4972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 secondByte = it.nextByte(det);
4982ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 it.charValue = (it.charValue << 8) | secondByte;
499f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
5002ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 if (firstByte >= 0x81 && firstByte <= 0xFE) {
5012ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // Two byte Char
5022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
5032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                         break buildChar;
5042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     }
505f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
5062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     // Four byte char
5072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     if (secondByte >= 0x30 && secondByte <= 0x39) {
5082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                         thirdByte = it.nextByte(det);
509f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
5102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                         if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
5112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                             fourthByte = it.nextByte(det);
512f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
5132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                             if (fourthByte >= 0x30 && fourthByte <= 0x39) {
5142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                                 it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
5152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                                 break buildChar;
5162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                             }
5172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                         }
5182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     }
519f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
5202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     it.error = true;
5212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                     break buildChar;
5222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 }
5232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             }
524f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
5252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return (it.done == false);
5262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
527f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
528f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         static int [] commonChars =
5292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             // TODO:  This set of data comes from the character frequency-
5302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             //        of-occurence analysis tool.  The data needs to be moved
5312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             //        into a resource and loaded from there.
532f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert            {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
533f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
534f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
535f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
536f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
537f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
538f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
539f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
540f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert             0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
5412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
5422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
543f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
544f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
545f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        String getName() {
5462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return "GB18030";
5472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
548f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
549f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
550f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        CharsetMatch match(CharsetDetector det) {
5512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             int confidence = match(det, commonChars);
5522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
5532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
554f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
555f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert         @Override
556f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        public String getLanguage()
5572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         {
5582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller             return "zh";
5592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller         }
5602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     }
561f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
562f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
5632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller}
564