CharsetRecog_2022.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
1/* 2******************************************************************************* 3* Copyright (C) 2005 - 2012, International Business Machines Corporation and * 4* others. All Rights Reserved. * 5******************************************************************************* 6*/ 7package com.ibm.icu.text; 8 9/** 10 * class CharsetRecog_2022 part of the ICU charset detection imlementation. 11 * This is a superclass for the individual detectors for 12 * each of the detectable members of the ISO 2022 family 13 * of encodings. 14 * 15 * The separate classes are nested within this class. 16 */ 17abstract class CharsetRecog_2022 extends CharsetRecognizer { 18 19 20 /** 21 * Matching function shared among the 2022 detectors JP, CN and KR 22 * Counts up the number of legal an unrecognized escape sequences in 23 * the sample of text, and computes a score based on the total number & 24 * the proportion that fit the encoding. 25 * 26 * 27 * @param text the byte buffer containing text to analyse 28 * @param textLen the size of the text in the byte. 29 * @param escapeSequences the byte escape sequences to test for. 30 * @return match quality, in the range of 0-100. 31 */ 32 int match(byte [] text, int textLen, byte [][] escapeSequences) { 33 int i, j; 34 int escN; 35 int hits = 0; 36 int misses = 0; 37 int shifts = 0; 38 int quality; 39 scanInput: 40 for (i=0; i<textLen; i++) { 41 if (text[i] == 0x1b) { 42 checkEscapes: 43 for (escN=0; escN<escapeSequences.length; escN++) { 44 byte [] seq = escapeSequences[escN]; 45 46 if ((textLen - i) < seq.length) { 47 continue checkEscapes; 48 } 49 50 for (j=1; j<seq.length; j++) { 51 if (seq[j] != text[i+j]) { 52 continue checkEscapes; 53 } 54 } 55 56 hits++; 57 i += seq.length-1; 58 continue scanInput; 59 } 60 61 misses++; 62 } 63 64 if (text[i] == 0x0e || text[i] == 0x0f) { 65 // Shift in/out 66 shifts++; 67 } 68 } 69 70 if (hits == 0) { 71 return 0; 72 } 73 74 // 75 // Initial quality is based on relative proportion of recongized vs. 76 // unrecognized escape sequences. 77 // All good: quality = 100; 78 // half or less good: quality = 0; 79 // linear inbetween. 80 quality = (100*hits - 100*misses) / (hits + misses); 81 82 // Back off quality if there were too few escape sequences seen. 83 // Include shifts in this computation, so that KR does not get penalized 84 // for having only a single Escape sequence, but many shifts. 85 if (hits+shifts < 5) { 86 quality -= (5-(hits+shifts))*10; 87 } 88 89 if (quality < 0) { 90 quality = 0; 91 } 92 return quality; 93 } 94 95 96 97 98 static class CharsetRecog_2022JP extends CharsetRecog_2022 { 99 private byte [] [] escapeSequences = { 100 {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992 101 {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990 102 {0x1b, 0x24, 0x40}, // JIS C 6226-1978 103 {0x1b, 0x24, 0x41}, // GB 2312-80 104 {0x1b, 0x24, 0x42}, // JIS X 208-1983 105 {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997 106 {0x1b, 0x28, 0x42}, // ASCII 107 {0x1b, 0x28, 0x48}, // JIS-Roman 108 {0x1b, 0x28, 0x49}, // Half-width katakana 109 {0x1b, 0x28, 0x4a}, // JIS-Roman 110 {0x1b, 0x2e, 0x41}, // ISO 8859-1 111 {0x1b, 0x2e, 0x46} // ISO 8859-7 112 }; 113 114 String getName() { 115 return "ISO-2022-JP"; 116 } 117 118 CharsetMatch match(CharsetDetector det) { 119 int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 120 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 121 } 122 } 123 124 static class CharsetRecog_2022KR extends CharsetRecog_2022 { 125 private byte [] [] escapeSequences = { 126 {0x1b, 0x24, 0x29, 0x43} 127 }; 128 129 String getName() { 130 return "ISO-2022-KR"; 131 } 132 133 CharsetMatch match(CharsetDetector det) { 134 int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 135 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 136 } 137 } 138 139 static class CharsetRecog_2022CN extends CharsetRecog_2022 { 140 private byte [] [] escapeSequences = { 141 {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80 142 {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1 143 {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2 144 {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165 145 {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3 146 {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4 147 {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5 148 {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6 149 {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7 150 {0x1b, 0x4e}, // SS2 151 {0x1b, 0x4f}, // SS3 152 }; 153 154 String getName() { 155 return "ISO-2022-CN"; 156 } 157 158 CharsetMatch match(CharsetDetector det) { 159 int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 160 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 161 } 162 } 163 164} 165 166