12ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/* GENERATED SOURCE. DO NOT MODIFY. */ 2f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 3f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 42ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/* 52ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller******************************************************************************* 62ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller* Copyright (C) 2005 - 2012, International Business Machines Corporation and * 72ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller* others. All Rights Reserved. * 82ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller******************************************************************************* 92ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller*/ 102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerpackage android.icu.text; 112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/** 132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * class CharsetRecog_2022 part of the ICU charset detection imlementation. 142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * This is a superclass for the individual detectors for 152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * each of the detectable members of the ISO 2022 family 162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * of encodings. 17f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * 182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * The separate classes are nested within this class. 192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerabstract class CharsetRecog_2022 extends CharsetRecognizer { 212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 22f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Matching function shared among the 2022 detectors JP, CN and KR 252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Counts up the number of legal an unrecognized escape sequences in 262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * the sample of text, and computes a score based on the total number & 272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * the proportion that fit the encoding. 28f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * 29f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert * 302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param text the byte buffer containing text to analyse 312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param textLen the size of the text in the byte. 322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param escapeSequences the byte escape sequences to test for. 332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @return match quality, in the range of 0-100. 342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int match(byte [] text, int textLen, byte [][] escapeSequences) { 362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int i, j; 372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int escN; 382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int hits = 0; 392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int misses = 0; 402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int shifts = 0; 412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int quality; 422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller scanInput: 432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (i=0; i<textLen; i++) { 442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (text[i] == 0x1b) { 452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller checkEscapes: 462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (escN=0; escN<escapeSequences.length; escN++) { 472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller byte [] seq = escapeSequences[escN]; 48f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if ((textLen - i) < seq.length) { 502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller continue checkEscapes; 512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 52f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (j=1; j<seq.length; j++) { 542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (seq[j] != text[i+j]) { 552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller continue checkEscapes; 56f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert } 572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 58f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 59f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert hits++; 602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller i += seq.length-1; 612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller continue scanInput; 622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 63f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 64f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert misses++; 652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 66f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (text[i] == 0x0e || text[i] == 0x0f) { 682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Shift in/out 692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller shifts++; 702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 72f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (hits == 0) { 742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return 0; 752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 76f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // 782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Initial quality is based on relative proportion of recongized vs. 79f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert // unrecognized escape sequences. 802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // All good: quality = 100; 812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // half or less good: quality = 0; 822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // linear inbetween. 832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller quality = (100*hits - 100*misses) / (hits + misses); 84f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Back off quality if there were too few escape sequences seen. 862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Include shifts in this computation, so that KR does not get penalized 872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // for having only a single Escape sequence, but many shifts. 882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (hits+shifts < 5) { 892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller quality -= (5-(hits+shifts))*10; 902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 91f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (quality < 0) { 932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller quality = 0; 94f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert } 952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return quality; 962ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 98f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 99f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 100f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1012ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static class CharsetRecog_2022JP extends CharsetRecog_2022 { 1022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private byte [] [] escapeSequences = { 1032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992 1042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990 1052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x40}, // JIS C 6226-1978 1062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x41}, // GB 2312-80 1072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x42}, // JIS X 208-1983 1082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997 1092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x28, 0x42}, // ASCII 1102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x28, 0x48}, // JIS-Roman 1112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x28, 0x49}, // Half-width katakana 1122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x28, 0x4a}, // JIS-Roman 1132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x2e, 0x41}, // ISO 8859-1 1142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x2e, 0x46} // ISO 8859-7 1152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller }; 116f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 117f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 1182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String getName() { 1192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "ISO-2022-JP"; 1202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 121f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 122f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 1232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller CharsetMatch match(CharsetDetector det) { 1242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 1252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 1262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static class CharsetRecog_2022KR extends CharsetRecog_2022 { 1302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private byte [] [] escapeSequences = { 131f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert {0x1b, 0x24, 0x29, 0x43} 1322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller }; 133f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 134f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 1352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String getName() { 1362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "ISO-2022-KR"; 1372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 138f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 139f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 1402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller CharsetMatch match(CharsetDetector det) { 1412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 1422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 1432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static class CharsetRecog_2022CN extends CharsetRecog_2022 { 1472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private byte [] [] escapeSequences = { 1482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80 1492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1 1502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2 1512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165 1522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3 1532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4 1542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5 1552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6 1562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7 1572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x4e}, // SS2 1582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller {0x1b, 0x4f}, // SS3 1592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller }; 160f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 161f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 1622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String getName() { 1632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return "ISO-2022-CN"; 1642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 165f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 166f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert @Override 1672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller CharsetMatch match(CharsetDetector det) { 1682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 1692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 1702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 172f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert 1732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller} 1742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 175