CharsetRecog_2022.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
1/*
2*******************************************************************************
3* Copyright (C) 2005 - 2012, International Business Machines Corporation and  *
4* others. All Rights Reserved.                                                *
5*******************************************************************************
6*/
7package com.ibm.icu.text;
8
9/**
10 *  class CharsetRecog_2022  part of the ICU charset detection imlementation.
11 *                           This is a superclass for the individual detectors for
12 *                           each of the detectable members of the ISO 2022 family
13 *                           of encodings.
14 *
15 *                           The separate classes are nested within this class.
16 */
17abstract class CharsetRecog_2022 extends CharsetRecognizer {
18
19
20    /**
21     * Matching function shared among the 2022 detectors JP, CN and KR
22     * Counts up the number of legal an unrecognized escape sequences in
23     * the sample of text, and computes a score based on the total number &
24     * the proportion that fit the encoding.
25     *
26     *
27     * @param text the byte buffer containing text to analyse
28     * @param textLen  the size of the text in the byte.
29     * @param escapeSequences the byte escape sequences to test for.
30     * @return match quality, in the range of 0-100.
31     */
32    int   match(byte [] text, int textLen, byte [][] escapeSequences) {
33        int     i, j;
34        int     escN;
35        int     hits   = 0;
36        int     misses = 0;
37        int     shifts = 0;
38        int     quality;
39        scanInput:
40            for (i=0; i<textLen; i++) {
41                if (text[i] == 0x1b) {
42                    checkEscapes:
43                        for (escN=0; escN<escapeSequences.length; escN++) {
44                            byte [] seq = escapeSequences[escN];
45
46                            if ((textLen - i) < seq.length) {
47                                continue checkEscapes;
48                            }
49
50                            for (j=1; j<seq.length; j++) {
51                                if (seq[j] != text[i+j])  {
52                                    continue checkEscapes;
53                                }
54                            }
55
56                            hits++;
57                            i += seq.length-1;
58                            continue scanInput;
59                        }
60
61                        misses++;
62                }
63
64                if (text[i] == 0x0e || text[i] == 0x0f) {
65                    // Shift in/out
66                    shifts++;
67                }
68            }
69
70        if (hits == 0) {
71            return 0;
72        }
73
74        //
75        // Initial quality is based on relative proportion of recongized vs.
76        //   unrecognized escape sequences.
77        //   All good:  quality = 100;
78        //   half or less good: quality = 0;
79        //   linear inbetween.
80        quality = (100*hits - 100*misses) / (hits + misses);
81
82        // Back off quality if there were too few escape sequences seen.
83        //   Include shifts in this computation, so that KR does not get penalized
84        //   for having only a single Escape sequence, but many shifts.
85        if (hits+shifts < 5) {
86            quality -= (5-(hits+shifts))*10;
87        }
88
89        if (quality < 0) {
90            quality = 0;
91        }
92        return quality;
93    }
94
95
96
97
98    static class CharsetRecog_2022JP extends CharsetRecog_2022 {
99        private byte [] [] escapeSequences = {
100                {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
101                {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
102                {0x1b, 0x24, 0x40},         // JIS C 6226-1978
103                {0x1b, 0x24, 0x41},         // GB 2312-80
104                {0x1b, 0x24, 0x42},         // JIS X 208-1983
105                {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
106                {0x1b, 0x28, 0x42},         // ASCII
107                {0x1b, 0x28, 0x48},         // JIS-Roman
108                {0x1b, 0x28, 0x49},         // Half-width katakana
109                {0x1b, 0x28, 0x4a},         // JIS-Roman
110                {0x1b, 0x2e, 0x41},         // ISO 8859-1
111                {0x1b, 0x2e, 0x46}          // ISO 8859-7
112                };
113
114        String getName() {
115            return "ISO-2022-JP";
116        }
117
118        CharsetMatch   match(CharsetDetector det) {
119            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
120            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
121        }
122    }
123
124    static class CharsetRecog_2022KR extends CharsetRecog_2022 {
125        private byte [] [] escapeSequences = {
126                {0x1b, 0x24, 0x29, 0x43}
127                 };
128
129        String getName() {
130            return "ISO-2022-KR";
131        }
132
133        CharsetMatch   match(CharsetDetector det) {
134            int confidence =  match(det.fInputBytes, det.fInputLen, escapeSequences);
135            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
136        }
137    }
138
139    static class CharsetRecog_2022CN extends CharsetRecog_2022 {
140        private byte [] [] escapeSequences = {
141                {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
142                {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
143                {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
144                {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
145                {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
146                {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
147                {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
148                {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
149                {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
150                {0x1b, 0x4e},               // SS2
151                {0x1b, 0x4f},               // SS3
152        };
153
154        String getName() {
155            return "ISO-2022-CN";
156        }
157
158        CharsetMatch   match(CharsetDetector det) {
159            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
160            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
161        }
162    }
163
164}
165
166