12ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/* GENERATED SOURCE. DO NOT MODIFY. */ 2f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 3f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 42ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/* 52ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller **************************************************************************** 62ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Copyright (C) 2005-2012, International Business Machines Corporation and * 72ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * others. All Rights Reserved. * 82ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller **************************************************************************** 92ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * 102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerpackage android.icu.text; 122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport java.util.Arrays; 142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/** 162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * CharsetRecognizer implemenation for Asian - double or multi-byte - charsets. 172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Match is determined mostly by the input data adhering to the 182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * encoding scheme for the charset, and, optionally, 192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * frequency-of-occurence of characters. 202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * <p/> 212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Instances of this class are singletons, one per encoding 222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * being recognized. 
They are created in the main 232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * CharsetDetector class and kept in the global list of available 242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * encodings to be checked. The specific encoding being recognized 252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * is determined by subclass. 262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerabstract class CharsetRecog_mbcs extends CharsetRecognizer { 282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Get the IANA name of this charset. 312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @return the charset name. 322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 33f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller abstract String getName() ; 35f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 36f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Test the match of this charset with the input text data 392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * which is obtained via the CharsetDetector object. 40f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * 412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param det The CharsetDetector, which contains the input text 422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * to be checked for being in this charset. 
432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @return Two values packed into one int (Damn java, anyhow) 442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * <br/> 452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * bits 0-7: the match confidence, ranging from 0-100 462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * <br/> 472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * bits 8-15: The match reason, an enum-like value. 482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int match(CharsetDetector det, int [] commonChars) { 502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller @SuppressWarnings("unused") 512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int singleByteCharCount = 0; //TODO Do we really need this? 522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int doubleByteCharCount = 0; 532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int commonCharCount = 0; 542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int badCharCount = 0; 552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int totalCharCount = 0; 562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int confidence = 0; 572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller iteratedChar iter = new iteratedChar(); 58f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller detectBlock: { 602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (iter.reset(); nextChar(iter, det);) { 612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller totalCharCount++; 622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (iter.error) { 63f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert badCharCount++; 642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } else { 652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller long cv = iter.charValue & 0xFFFFFFFFL; 66f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 672ae130017183d2f66d55bf0ca51f8da3294644fdNeil 
Fuller if (cv <= 0xff) { 682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller singleByteCharCount++; 692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } else { 702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller doubleByteCharCount++; 712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (commonChars != null) { 722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // NOTE: This assumes that there are no 4-byte common chars. 732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (Arrays.binarySearch(commonChars, (int) cv) >= 0) { 742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller commonCharCount++; 752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Bail out early if the byte data is not matching the encoding scheme. 812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller break detectBlock; 822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 84f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (doubleByteCharCount <= 10 && badCharCount== 0) { 862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Not many multi-byte chars. 872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (doubleByteCharCount == 0 && totalCharCount < 10) { 882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // We don't have enough data to have any confidence. 
902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Statistical analysis of single byte non-ASCII charcters would probably help here. 912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller confidence = 0; 922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller else { 942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // ASCII or ISO file? It's probably not our encoding, 952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // but is not incompatible with our encoding, so don't give it a zero. 962ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller confidence = 10; 972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 98f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller break detectBlock; 1002ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 101f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // 1032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // No match if there are too many characters that don't fit the encoding scheme. 1042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // (should we have zero tolerance for these?) 1052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // 1062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (doubleByteCharCount < 20*badCharCount) { 1072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller confidence = 0; 1082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller break detectBlock; 1092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 110f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (commonChars == null) { 1122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // We have no statistics on frequently occuring characters. 
1132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Assess confidence purely on having a reasonable number of 1142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // multi-byte characters (the more the better 1152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller confidence = 30 + doubleByteCharCount - 20*badCharCount; 1162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (confidence > 100) { 1172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller confidence = 100; 1182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller }else { 1202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // 1212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Frequency of occurence statistics exist. 1222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // 1232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller double maxVal = Math.log((float)doubleByteCharCount / 4); 1242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller double scaleFactor = 90.0 / maxVal; 1252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10); 1262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller confidence = Math.min(confidence, 100); 1272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } // end of detectBlock: 129f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return confidence; 1312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 132f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // "Character" iterated character class. 
1342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Recognizers for specific mbcs encodings make their "characters" available 1352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // by providing a nextChar() function that fills in an instance of iteratedChar 1362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // with the next char from the input. 1372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // The returned characters are not converted to Unicode, but remain as the raw 1382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // bytes (concatenated into an int) from the codepage data. 1392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // 1402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // For Asian charsets, use the raw input rather than the input that has been 1412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // stripped of markup. Detection only considers multi-byte chars, effectively 1422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // stripping markup anyway, and double byte chars do occur in markup too. 
1432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // 1442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static class iteratedChar { 1452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int charValue = 0; // 1-4 bytes from the raw input data 1462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int nextIndex = 0; 1472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller boolean error = false; 1482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller boolean done = false; 149f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller void reset() { 1512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller charValue = 0; 1522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller nextIndex = 0; 1532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller error = false; 1542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller done = false; 1552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 156f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int nextByte(CharsetDetector det) { 1582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (nextIndex >= det.fRawLength) { 1592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller done = true; 1602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return -1; 1612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 162f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert int byteValue = det.fRawInput[nextIndex++] & 0x00ff; 1632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return byteValue; 164f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert } 1652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 166f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 1682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Get the next character (however many bytes it is) from the input data 1692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Subclasses for 
specific charset encodings must implement this function 1702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * to get characters according to the rules of their encoding scheme. 171f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * 1722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * This function is not a method of class iteratedChar only because 1732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * that would require a lot of extra derived classes, which is awkward. 1742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param it The iteratedChar "struct" into which the returned char is placed. 1752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param det The charset detector, which is needed to get at the input byte data 1762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * being iterated over. 1772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @return True if a character was returned, false at end of input. 1782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 1792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller abstract boolean nextChar(iteratedChar it, CharsetDetector det); 1802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 182f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 183f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 184f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 186f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * Shift-JIS charset recognizer. 
1872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * 1882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 1892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static class CharsetRecog_sjis extends CharsetRecog_mbcs { 190f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert static int [] commonChars = 1912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: This set of data comes from the character frequency- 1922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // of-occurence analysis tool. The data needs to be moved 1932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // into a resource and loaded from there. 194f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 195f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 196f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 197f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 198f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 1992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 200f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 201f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 202f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert boolean nextChar(iteratedChar it, CharsetDetector det) { 2032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.error = false; 2042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int firstByte; 2052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller firstByte = it.charValue = it.nextByte(det); 2062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if 
(firstByte < 0) { 2072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return false; 2082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 209f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 2102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) { 2112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return true; 2122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 213f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 2142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int secondByte = it.nextByte(det); 2152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (secondByte < 0) { 216f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert return false; 2172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.charValue = (firstByte << 8) | secondByte; 2192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) { 2202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Illegal second byte value. 2212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.error = true; 2222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return true; 2242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 225f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 226f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 227f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert CharsetMatch match(CharsetDetector det) { 2282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int confidence = match(det, commonChars); 2292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return confidence == 0 ? 
null : new CharsetMatch(det, this, confidence); 2302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 231f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 232f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 233f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert String getName() { 2342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "Shift_JIS"; 2352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 236f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 237f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 238f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert public String getLanguage() 2392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller { 2402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "ja"; 2412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 243f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 2442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 245f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 246f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 2472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 248f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * Big5 charset recognizer. 2492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * 2502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 2512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static class CharsetRecog_big5 extends CharsetRecog_mbcs { 252f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert static int [] commonChars = 2532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: This set of data comes from the character frequency- 2542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // of-occurence analysis tool. The data needs to be moved 2552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // into a resource and loaded from there. 
256f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 257f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 258f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 259f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 260f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 261f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 262f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 263f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 264f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 2652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; 266f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 267f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 268f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert boolean nextChar(iteratedChar it, CharsetDetector det) { 2692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.error = false; 2702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int firstByte; 2712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller firstByte = it.charValue = it.nextByte(det); 2722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (firstByte < 0) { 2732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return false; 
2742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 275f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 2762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (firstByte <= 0x7f || firstByte==0xff) { 2772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // single byte character. 2782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return true; 2792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 280f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 2812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int secondByte = it.nextByte(det); 2822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (secondByte < 0) { 283f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert return false; 2842ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.charValue = (it.charValue << 8) | secondByte; 2862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (secondByte < 0x40 || 2882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller secondByte ==0x7f || 2892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller secondByte == 0xff) { 2902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.error = true; 2912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return true; 2932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 294f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 295f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 296f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert CharsetMatch match(CharsetDetector det) { 2972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int confidence = match(det, commonChars); 2982ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return confidence == 0 ? 
null : new CharsetMatch(det, this, confidence); 2992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 300f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 301f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 302f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert String getName() { 3032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "Big5"; 3042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 305f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 306f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 307f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 308f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert public String getLanguage() 3092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller { 3102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "zh"; 3112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 313f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 314f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 3152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 3162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * EUC charset recognizers. One abstract class that provides the common function 3172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * for getting the next character according to the EUC encoding scheme, 318f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. 
3192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * 3202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 3212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller abstract static class CharsetRecog_euc extends CharsetRecog_mbcs { 322f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 3232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /* 3242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * (non-Javadoc) 3252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Get the next character value for EUC based encodings. 3262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Character "value" is simply the raw bytes that make up the character 3272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * packed into an int. 3282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 329f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 330f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert boolean nextChar(iteratedChar it, CharsetDetector det) { 3312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.error = false; 3322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int firstByte = 0; 3332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int secondByte = 0; 3342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int thirdByte = 0; 3352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //int fourthByte = 0; 336f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 3372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller buildChar: { 338f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert firstByte = it.charValue = it.nextByte(det); 3392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (firstByte < 0) { 3402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Ran off the end of the input data 3412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.done = true; 3422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller break buildChar; 3432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 
3442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (firstByte <= 0x8d) { 3452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // single byte char 3462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller break buildChar; 3472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 348f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 3492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller secondByte = it.nextByte(det); 3502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.charValue = (it.charValue << 8) | secondByte; 351f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 3522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (firstByte >= 0xA1 && firstByte <= 0xfe) { 3532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Two byte Char 3542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (secondByte < 0xa1) { 3552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.error = true; 3562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller break buildChar; 3582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (firstByte == 0x8e) { 3602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Code Set 2. 3612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 3622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 3632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // We don't know which we've got. 3642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Treat it like EUC-JP. If the data really was EUC-TW, the following two 365f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert // bytes will look like a well formed 2 byte char. 
3662ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (secondByte < 0xa1) { 3672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.error = true; 3682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 369f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert break buildChar; 3702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 371f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 3722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (firstByte == 0x8f) { 3732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Code set 3. 3742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Three byte total char size, two bytes of actual char value. 3752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller thirdByte = it.nextByte(det); 3762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.charValue = (it.charValue << 8) | thirdByte; 3772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (thirdByte < 0xa1) { 3782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller it.error = true; 3792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 382f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 3832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return (it.done == false); 3842ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 385f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 3862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 3872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * The charset recognize for EUC-JP. 
A singleton instance of this class 3882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * is created and kept by the public CharsetDetector class 3892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 3902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static class CharsetRecog_euc_jp extends CharsetRecog_euc { 391f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert static int [] commonChars = 3922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: This set of data comes from the character frequency- 3932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // of-occurence analysis tool. The data needs to be moved 3942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // into a resource and loaded from there. 395f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 396f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 397f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 398f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 399f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 400f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 401f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 402f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 403f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 
404f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 405f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 406f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert String getName() { 4072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "EUC-JP"; 4082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 409f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 410f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 411f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert CharsetMatch match(CharsetDetector det) { 4122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int confidence = match(det, commonChars); 4132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 4142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 415f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 416f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 417f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert public String getLanguage() 4182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller { 4192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "ja"; 4202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 422f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 4232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 4242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * The charset recognize for EUC-KR. 
A singleton instance of this class 4252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * is created and kept by the public CharsetDetector class 4262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 4272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static class CharsetRecog_euc_kr extends CharsetRecog_euc { 428f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert static int [] commonChars = 4292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: This set of data comes from the character frequency- 4302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // of-occurence analysis tool. The data needs to be moved 4312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // into a resource and loaded from there. 432f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 433f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 434f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 435f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 436f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 437f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 438f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 439f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 440f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 
4412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; 442f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 443f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 444f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert String getName() { 4452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "EUC-KR"; 4462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 447f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 448f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 449f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert CharsetMatch match(CharsetDetector det) { 4502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int confidence = match(det, commonChars); 4512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 4522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 453f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 454f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 455f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert public String getLanguage() 4562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller { 4572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "ko"; 4582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 461f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 4622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 463f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * 464f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * GB-18030 recognizer. Uses simplified Chinese statistics. 
Input is checked against the one-, two- and four-byte forms of the encoding.
     *
     */
    static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {

        /*
         *  (non-Javadoc)
         *  Get the next character value for GB18030 input.
         *  Character "value" is simply the raw bytes that make up the character
         *     packed into an int.
         *
         *  Sets it.error when the bytes read do not form a valid sequence and
         *  it.done when the first byte read is negative (end of input).
         *  Returns true while it.done has not been set.
         */
        @Override
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.error = false;

            buildChar: {
                int firstByte = it.charValue = it.nextByte(det);

                if (firstByte < 0) {
                    // Ran off the end of the input data
                    it.done = true;
                    break buildChar;
                }

                if (firstByte <= 0x80) {
                    // single byte char
                    break buildChar;
                }

                int secondByte = it.nextByte(det);
                it.charValue = (it.charValue << 8) | secondByte;

                // A first byte of 0xFF skips this branch: two bytes are consumed
                // and no error is flagged (unchanged from the original logic).
                if (firstByte >= 0x81 && firstByte <= 0xFE) {
                    // Two byte char. Valid trail bytes are 0x40-0x7E and 0x80-0xFE;
                    // 0x7F is not a legal trail byte. The lower bound of the second
                    // range was previously written as decimal 80, which made the
                    // two ranges overlap and let 0x7F through.
                    if ((secondByte >= 0x40 && secondByte <= 0x7E)
                            || (secondByte >= 0x80 && secondByte <= 0xFE)) {
                        break buildChar;
                    }

                    // Four byte char: digit, lead-range byte, digit.
                    if (secondByte >= 0x30 && secondByte <= 0x39) {
                        int thirdByte = it.nextByte(det);

                        if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                            int fourthByte = it.nextByte(det);

                            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                                // charValue already holds the first two bytes.
                                it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
                                break buildChar;
                            }
                        }
                    }

                    // Neither a valid two-byte nor a valid four-byte sequence.
                    it.error = true;
                    break buildChar;
                }
            }

            return (it.done == false);
        }

        // TODO: This table comes from the character frequency-of-occurrence
        //       analysis tool. It should be moved into a resource and loaded
        //       from there.
        // Each entry is a two-byte sequence packed into an int.
        static int[] commonChars = {
            0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
            0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
            0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
            0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
            0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
            0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
            0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
            0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
            0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
            0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
        };

        /** @return the charset name reported for this recognizer, "GB18030". */
        @Override
        String getName() {
            return "GB18030";
        }

        /**
         * Scores the detector's input against the GB18030 frequency table.
         *
         * @param det the detector holding the input to examine
         * @return a match carrying the computed confidence, or null when the
         *         confidence is zero
         */
        @Override
        CharsetMatch match(CharsetDetector det) {
            int confidence = match(det, commonChars);
            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
        }

        /** @return the language code associated with this charset, "zh". */
        @Override
        public String getLanguage() {
            return "zh";
        }
    }

}