1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4******************************************************************************* 5* Copyright (C) 2005 - 2012, International Business Machines Corporation and * 6* others. All Rights Reserved. * 7******************************************************************************* 8*/ 9package com.ibm.icu.text; 10 11/** 12 * class CharsetRecog_2022 part of the ICU charset detection imlementation. 13 * This is a superclass for the individual detectors for 14 * each of the detectable members of the ISO 2022 family 15 * of encodings. 16 * 17 * The separate classes are nested within this class. 18 */ 19abstract class CharsetRecog_2022 extends CharsetRecognizer { 20 21 22 /** 23 * Matching function shared among the 2022 detectors JP, CN and KR 24 * Counts up the number of legal an unrecognized escape sequences in 25 * the sample of text, and computes a score based on the total number & 26 * the proportion that fit the encoding. 27 * 28 * 29 * @param text the byte buffer containing text to analyse 30 * @param textLen the size of the text in the byte. 31 * @param escapeSequences the byte escape sequences to test for. 32 * @return match quality, in the range of 0-100. 33 */ 34 int match(byte [] text, int textLen, byte [][] escapeSequences) { 35 int i, j; 36 int escN; 37 int hits = 0; 38 int misses = 0; 39 int shifts = 0; 40 int quality; 41 scanInput: 42 for (i=0; i<textLen; i++) { 43 if (text[i] == 0x1b) { 44 checkEscapes: 45 for (escN=0; escN<escapeSequences.length; escN++) { 46 byte [] seq = escapeSequences[escN]; 47 48 if ((textLen - i) < seq.length) { 49 continue checkEscapes; 50 } 51 52 for (j=1; j<seq.length; j++) { 53 if (seq[j] != text[i+j]) { 54 continue checkEscapes; 55 } 56 } 57 58 hits++; 59 i += seq.length-1; 60 continue scanInput; 61 } 62 63 misses++; 64 } 65 66 if (text[i] == 0x0e || text[i] == 0x0f) { 67 // Shift in/out 68 shifts++; 69 } 70 } 71 72 if (hits == 0) { 73 return 0; 74 } 75 76 // 77 // Initial quality is based on relative proportion of recongized vs. 78 // unrecognized escape sequences. 79 // All good: quality = 100; 80 // half or less good: quality = 0; 81 // linear inbetween. 82 quality = (100*hits - 100*misses) / (hits + misses); 83 84 // Back off quality if there were too few escape sequences seen. 85 // Include shifts in this computation, so that KR does not get penalized 86 // for having only a single Escape sequence, but many shifts. 87 if (hits+shifts < 5) { 88 quality -= (5-(hits+shifts))*10; 89 } 90 91 if (quality < 0) { 92 quality = 0; 93 } 94 return quality; 95 } 96 97 98 99 100 static class CharsetRecog_2022JP extends CharsetRecog_2022 { 101 private byte [] [] escapeSequences = { 102 {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992 103 {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990 104 {0x1b, 0x24, 0x40}, // JIS C 6226-1978 105 {0x1b, 0x24, 0x41}, // GB 2312-80 106 {0x1b, 0x24, 0x42}, // JIS X 208-1983 107 {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997 108 {0x1b, 0x28, 0x42}, // ASCII 109 {0x1b, 0x28, 0x48}, // JIS-Roman 110 {0x1b, 0x28, 0x49}, // Half-width katakana 111 {0x1b, 0x28, 0x4a}, // JIS-Roman 112 {0x1b, 0x2e, 0x41}, // ISO 8859-1 113 {0x1b, 0x2e, 0x46} // ISO 8859-7 114 }; 115 116 @Override 117 String getName() { 118 return "ISO-2022-JP"; 119 } 120 121 @Override 122 CharsetMatch match(CharsetDetector det) { 123 int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 124 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 125 } 126 } 127 128 static class CharsetRecog_2022KR extends CharsetRecog_2022 { 129 private byte [] [] escapeSequences = { 130 {0x1b, 0x24, 0x29, 0x43} 131 }; 132 133 @Override 134 String getName() { 135 return "ISO-2022-KR"; 136 } 137 138 @Override 139 CharsetMatch match(CharsetDetector det) { 140 int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 141 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 142 } 143 } 144 145 static class CharsetRecog_2022CN extends CharsetRecog_2022 { 146 private byte [] [] escapeSequences = { 147 {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80 148 {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1 149 {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2 150 {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165 151 {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3 152 {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4 153 {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5 154 {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6 155 {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7 156 {0x1b, 0x4e}, // SS2 157 {0x1b, 0x4f}, // SS3 158 }; 159 160 @Override 161 String getName() { 162 return "ISO-2022-CN"; 163 } 164 165 @Override 166 CharsetMatch match(CharsetDetector det) { 167 int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 168 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 169 } 170 } 171 172} 173 174