1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2005 - 2012, International Business Machines Corporation and  *
6* others. All Rights Reserved.                                                *
7*******************************************************************************
8*/
9package com.ibm.icu.text;
10
11/**
12 *  class CharsetRecog_2022  part of the ICU charset detection imlementation.
13 *                           This is a superclass for the individual detectors for
14 *                           each of the detectable members of the ISO 2022 family
15 *                           of encodings.
16 *
17 *                           The separate classes are nested within this class.
18 */
19abstract class CharsetRecog_2022 extends CharsetRecognizer {
20
21
22    /**
23     * Matching function shared among the 2022 detectors JP, CN and KR
24     * Counts up the number of legal an unrecognized escape sequences in
25     * the sample of text, and computes a score based on the total number &
26     * the proportion that fit the encoding.
27     *
28     *
29     * @param text the byte buffer containing text to analyse
30     * @param textLen  the size of the text in the byte.
31     * @param escapeSequences the byte escape sequences to test for.
32     * @return match quality, in the range of 0-100.
33     */
34    int   match(byte [] text, int textLen, byte [][] escapeSequences) {
35        int     i, j;
36        int     escN;
37        int     hits   = 0;
38        int     misses = 0;
39        int     shifts = 0;
40        int     quality;
41        scanInput:
42            for (i=0; i<textLen; i++) {
43                if (text[i] == 0x1b) {
44                    checkEscapes:
45                        for (escN=0; escN<escapeSequences.length; escN++) {
46                            byte [] seq = escapeSequences[escN];
47
48                            if ((textLen - i) < seq.length) {
49                                continue checkEscapes;
50                            }
51
52                            for (j=1; j<seq.length; j++) {
53                                if (seq[j] != text[i+j])  {
54                                    continue checkEscapes;
55                                }
56                            }
57
58                            hits++;
59                            i += seq.length-1;
60                            continue scanInput;
61                        }
62
63                        misses++;
64                }
65
66                if (text[i] == 0x0e || text[i] == 0x0f) {
67                    // Shift in/out
68                    shifts++;
69                }
70            }
71
72        if (hits == 0) {
73            return 0;
74        }
75
76        //
77        // Initial quality is based on relative proportion of recongized vs.
78        //   unrecognized escape sequences.
79        //   All good:  quality = 100;
80        //   half or less good: quality = 0;
81        //   linear inbetween.
82        quality = (100*hits - 100*misses) / (hits + misses);
83
84        // Back off quality if there were too few escape sequences seen.
85        //   Include shifts in this computation, so that KR does not get penalized
86        //   for having only a single Escape sequence, but many shifts.
87        if (hits+shifts < 5) {
88            quality -= (5-(hits+shifts))*10;
89        }
90
91        if (quality < 0) {
92            quality = 0;
93        }
94        return quality;
95    }
96
97
98
99
100    static class CharsetRecog_2022JP extends CharsetRecog_2022 {
101        private byte [] [] escapeSequences = {
102                {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
103                {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
104                {0x1b, 0x24, 0x40},         // JIS C 6226-1978
105                {0x1b, 0x24, 0x41},         // GB 2312-80
106                {0x1b, 0x24, 0x42},         // JIS X 208-1983
107                {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
108                {0x1b, 0x28, 0x42},         // ASCII
109                {0x1b, 0x28, 0x48},         // JIS-Roman
110                {0x1b, 0x28, 0x49},         // Half-width katakana
111                {0x1b, 0x28, 0x4a},         // JIS-Roman
112                {0x1b, 0x2e, 0x41},         // ISO 8859-1
113                {0x1b, 0x2e, 0x46}          // ISO 8859-7
114                };
115
116        @Override
117        String getName() {
118            return "ISO-2022-JP";
119        }
120
121        @Override
122        CharsetMatch   match(CharsetDetector det) {
123            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
124            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
125        }
126    }
127
128    static class CharsetRecog_2022KR extends CharsetRecog_2022 {
129        private byte [] [] escapeSequences = {
130                {0x1b, 0x24, 0x29, 0x43}
131                 };
132
133        @Override
134        String getName() {
135            return "ISO-2022-KR";
136        }
137
138        @Override
139        CharsetMatch   match(CharsetDetector det) {
140            int confidence =  match(det.fInputBytes, det.fInputLen, escapeSequences);
141            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
142        }
143    }
144
145    static class CharsetRecog_2022CN extends CharsetRecog_2022 {
146        private byte [] [] escapeSequences = {
147                {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
148                {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
149                {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
150                {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
151                {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
152                {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
153                {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
154                {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
155                {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
156                {0x1b, 0x4e},               // SS2
157                {0x1b, 0x4f},               // SS3
158        };
159
160        @Override
161        String getName() {
162            return "ISO-2022-CN";
163        }
164
165        @Override
166        CharsetMatch   match(CharsetDetector det) {
167            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
168            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
169        }
170    }
171
172}
173
174