12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2005 - 2012, International Business Machines Corporation and * 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* others. All Rights Reserved. * 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text; 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * class CharsetRecog_2022 part of the ICU charset detection imlementation. 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is a superclass for the individual detectors for 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * each of the detectable members of the ISO 2022 family 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * of encodings. 162d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert * 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The separate classes are nested within this class. 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertabstract class CharsetRecog_2022 extends CharsetRecognizer { 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 212d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Matching function shared among the 2022 detectors JP, CN and KR 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Counts up the number of legal an unrecognized escape sequences in 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the sample of text, and computes a score based on the total number & 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the proportion that fit the encoding. 272d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert * 282d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert * 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param text the byte buffer containing text to analyse 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param textLen the size of the text in the byte. 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param escapeSequences the byte escape sequences to test for. 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return match quality, in the range of 0-100. 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int match(byte [] text, int textLen, byte [][] escapeSequences) { 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i, j; 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int escN; 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int hits = 0; 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int misses = 0; 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int shifts = 0; 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int quality; 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert scanInput: 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (i=0; i<textLen; i++) { 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (text[i] == 0x1b) { 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkEscapes: 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (escN=0; escN<escapeSequences.length; escN++) { 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert byte [] seq = escapeSequences[escN]; 472d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((textLen - i) < seq.length) { 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue checkEscapes; 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 512d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (j=1; j<seq.length; j++) { 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (seq[j] != text[i+j]) { 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue checkEscapes; 552d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert } 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 572d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 582d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert hits++; 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert i += seq.length-1; 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue scanInput; 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 622d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 632d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert misses++; 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 652d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (text[i] == 0x0e || text[i] == 0x0f) { 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Shift in/out 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert shifts++; 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 712d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (hits == 0) { 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 752d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Initial quality is based on relative proportion of recongized vs. 782d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert // unrecognized escape sequences. 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // All good: quality = 100; 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // half or less good: quality = 0; 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // linear inbetween. 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quality = (100*hits - 100*misses) / (hits + misses); 832d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Back off quality if there were too few escape sequences seen. 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Include shifts in this computation, so that KR does not get penalized 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // for having only a single Escape sequence, but many shifts. 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (hits+shifts < 5) { 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quality -= (5-(hits+shifts))*10; 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 902d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (quality < 0) { 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quality = 0; 932d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert } 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return quality; 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 972d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 982d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 992d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert static class CharsetRecog_2022JP extends CharsetRecog_2022 { 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private byte [] [] escapeSequences = { 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x40}, // JIS C 6226-1978 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x41}, // GB 2312-80 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x42}, // JIS X 208-1983 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x28, 0x42}, // ASCII 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x28, 0x48}, // JIS-Roman 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x28, 0x49}, // Half-width katakana 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x28, 0x4a}, // JIS-Roman 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x2e, 0x41}, // ISO 8859-1 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x2e, 0x46} // ISO 8859-7 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 1152d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 1162d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String getName() { 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return "ISO-2022-JP"; 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1202d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 1212d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsetMatch match(CharsetDetector det) { 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert static class CharsetRecog_2022KR extends CharsetRecog_2022 { 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private byte [] [] escapeSequences = { 1302d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert {0x1b, 0x24, 0x29, 0x43} 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 1322d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 1332d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String getName() { 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return "ISO-2022-KR"; 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1372d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 1382d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsetMatch match(CharsetDetector det) { 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert static class CharsetRecog_2022CN extends CharsetRecog_2022 { 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private byte [] [] escapeSequences = { 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x4e}, // SS2 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert {0x1b, 0x4f}, // SS3 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 1592d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 1602d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String getName() { 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return "ISO-2022-CN"; 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1642d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 1652d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsetMatch match(CharsetDetector det) { 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1712d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 174