1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "csmatch.h"
13#include "csrmbcs.h"
14
15#include <math.h>
16
17U_NAMESPACE_BEGIN
18
19#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
20
21#define min(x,y) (((x)<(y))?(x):(y))
22
// Two-byte code values that CharsetRecog_sjis::match() hands to match_mbcs()
// as its "common characters" table.
// The entries are listed in ascending order; match_mbcs() looks values up
// with binarySearch(), so the order must be preserved when editing.
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
33
// Two-byte code values that CharsetRecog_euc_jp::match() hands to
// match_mbcs() as its "common characters" table.
// The entries are listed in ascending order; match_mbcs() looks values up
// with binarySearch(), so the order must be preserved when editing.
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
48
// Two-byte code values that CharsetRecog_euc_kr::match() hands to
// match_mbcs() as its "common characters" table.
// The entries are listed in ascending order; match_mbcs() looks values up
// with binarySearch(), so the order must be preserved when editing.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
63
// Two-byte code values that CharsetRecog_big5::match() hands to match_mbcs()
// as its "common characters" table.
// The entries are listed in ascending order; match_mbcs() looks values up
// with binarySearch(), so the order must be preserved when editing.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
78
// Two-byte code values that CharsetRecog_gb_18030::match() hands to
// match_mbcs() as its "common characters" table.
// The entries are listed in ascending order; match_mbcs() looks values up
// with binarySearch(), so the order must be preserved when editing.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
93
/**
 * Binary search for a 16-bit value in a table sorted in ascending order.
 *
 * @param array  table to search; must be sorted ascending
 * @param len    number of entries in the table (0 or less yields "not found")
 * @param value  value to look for
 * @return index of a matching entry, or -1 if the value is not present
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    while (lo <= hi) {
        const int32_t probe = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            // Target, if present, lies in the upper half.
            lo = probe + 1;
        } else {
            // Target, if present, lies in the lower half.
            hi = probe - 1;
        }
    }

    return -1;
}
115
116IteratedChar::IteratedChar() :
117charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
118{
119    // nothing else to do.
120}
121
122/*void IteratedChar::reset()
123{
124    charValue = 0;
125    index     = -1;
126    nextIndex = 0;
127    error     = FALSE;
128    done      = FALSE;
129}*/
130
131int32_t IteratedChar::nextByte(InputText *det)
132{
133    if (nextIndex >= det->fRawLength) {
134        done = TRUE;
135
136        return -1;
137    }
138
139    return det->fRawInput[nextIndex++];
140}
141
// Destructor. The body is intentionally empty: nothing is released here.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
{
    // nothing to do.
}
146
147int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
148    int32_t singleByteCharCount = 0;
149    int32_t doubleByteCharCount = 0;
150    int32_t commonCharCount     = 0;
151    int32_t badCharCount        = 0;
152    int32_t totalCharCount      = 0;
153    int32_t confidence          = 0;
154    IteratedChar iter;
155
156    while (nextChar(&iter, det)) {
157        totalCharCount++;
158
159        if (iter.error) {
160            badCharCount++;
161        } else {
162            if (iter.charValue <= 0xFF) {
163                singleByteCharCount++;
164            } else {
165                doubleByteCharCount++;
166
167                if (commonChars != 0) {
168                    if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
169                        commonCharCount += 1;
170                    }
171                }
172            }
173        }
174
175
176        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
177            // Bail out early if the byte data is not matching the encoding scheme.
178            // break detectBlock;
179            return confidence;
180        }
181    }
182
183    if (doubleByteCharCount <= 10 && badCharCount == 0) {
184        // Not many multi-byte chars.
185        if (doubleByteCharCount == 0 && totalCharCount < 10) {
186            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
187            // We don't have enough data to have any confidence.
188            // Statistical analysis of single byte non-ASCII charcters would probably help here.
189            confidence = 0;
190        }
191        else {
192            //   ASCII or ISO file?  It's probably not our encoding,
193            //   but is not incompatible with our encoding, so don't give it a zero.
194            confidence = 10;
195        }
196
197        return confidence;
198    }
199
200    //
201    //  No match if there are too many characters that don't fit the encoding scheme.
202    //    (should we have zero tolerance for these?)
203    //
204    if (doubleByteCharCount < 20*badCharCount) {
205        confidence = 0;
206
207        return confidence;
208    }
209
210    if (commonChars == 0) {
211        // We have no statistics on frequently occuring characters.
212        //  Assess confidence purely on having a reasonable number of
213        //  multi-byte characters (the more the better)
214        confidence = 30 + doubleByteCharCount - 20*badCharCount;
215
216        if (confidence > 100) {
217            confidence = 100;
218        }
219    } else {
220        //
221        // Frequency of occurence statistics exist.
222        //
223
224        double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
225        double scaleFactor = 90.0 / maxVal;
226        confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
227
228        confidence = min(confidence, 100);
229    }
230
231    if (confidence < 0) {
232        confidence = 0;
233    }
234
235    return confidence;
236}
237
// Destructor. The body is intentionally empty: nothing is released here.
CharsetRecog_sjis::~CharsetRecog_sjis()
{
    // nothing to do
}
242
243UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
244    it->index = it->nextIndex;
245    it->error = FALSE;
246
247    int32_t firstByte = it->charValue = it->nextByte(det);
248
249    if (firstByte < 0) {
250        return FALSE;
251    }
252
253    if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
254        return TRUE;
255    }
256
257    int32_t secondByte = it->nextByte(det);
258    if (secondByte >= 0) {
259        it->charValue = (firstByte << 8) | secondByte;
260    }
261    // else we'll handle the error later.
262
263    if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
264        // Illegal second byte value.
265        it->error = TRUE;
266    }
267
268    return TRUE;
269}
270
271UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
272    int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
273    results->set(det, this, confidence);
274    return (confidence > 0);
275}
276
// Returns the charset name for this recognizer, the constant string "Shift_JIS".
const char *CharsetRecog_sjis::getName() const
{
    return "Shift_JIS";
}
281
// Returns the constant string "ja".
// NOTE(review): presumably a language code for Japanese -- confirm against the base class contract.
const char *CharsetRecog_sjis::getLanguage() const
{
    return "ja";
}
286
// Destructor. The body is intentionally empty: nothing is released here.
CharsetRecog_euc::~CharsetRecog_euc()
{
    // nothing to do
}
291
292UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
293    int32_t firstByte  = 0;
294    int32_t secondByte = 0;
295    int32_t thirdByte  = 0;
296
297    it->index = it->nextIndex;
298    it->error = FALSE;
299    firstByte = it->charValue = it->nextByte(det);
300
301    if (firstByte < 0) {
302        // Ran off the end of the input data
303        return FALSE;
304    }
305
306    if (firstByte <= 0x8D) {
307        // single byte char
308        return TRUE;
309    }
310
311    secondByte = it->nextByte(det);
312    if (secondByte >= 0) {
313        it->charValue = (it->charValue << 8) | secondByte;
314    }
315    // else we'll handle the error later.
316
317    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
318        // Two byte Char
319        if (secondByte < 0xA1) {
320            it->error = TRUE;
321        }
322
323        return TRUE;
324    }
325
326    if (firstByte == 0x8E) {
327        // Code Set 2.
328        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
329        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
330        // We don't know which we've got.
331        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
332        //   bytes will look like a well formed 2 byte char.
333        if (secondByte < 0xA1) {
334            it->error = TRUE;
335        }
336
337        return TRUE;
338    }
339
340    if (firstByte == 0x8F) {
341        // Code set 3.
342        // Three byte total char size, two bytes of actual char value.
343        thirdByte    = it->nextByte(det);
344        it->charValue = (it->charValue << 8) | thirdByte;
345
346        if (thirdByte < 0xa1) {
347            // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
348            it->error = TRUE;
349        }
350    }
351
352    return TRUE;
353
354}
355
// Destructor. The body is intentionally empty: nothing is released here.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
{
    // nothing to do
}
360
// Returns the charset name for this recognizer, the constant string "EUC-JP".
const char *CharsetRecog_euc_jp::getName() const
{
    return "EUC-JP";
}
365
// Returns the constant string "ja".
// NOTE(review): presumably a language code for Japanese -- confirm against the base class contract.
const char *CharsetRecog_euc_jp::getLanguage() const
{
    return "ja";
}
370
371UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
372{
373    int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
374    results->set(det, this, confidence);
375    return (confidence > 0);
376}
377
// Destructor. The body is intentionally empty: nothing is released here.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
{
    // nothing to do
}
382
// Returns the charset name for this recognizer, the constant string "EUC-KR".
const char *CharsetRecog_euc_kr::getName() const
{
    return "EUC-KR";
}
387
// Returns the constant string "ko".
// NOTE(review): presumably a language code for Korean -- confirm against the base class contract.
const char *CharsetRecog_euc_kr::getLanguage() const
{
    return "ko";
}
392
393UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
394{
395    int32_t confidence =  match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
396    results->set(det, this, confidence);
397    return (confidence > 0);
398}
399
// Destructor. The body is intentionally empty: nothing is released here.
CharsetRecog_big5::~CharsetRecog_big5()
{
    // nothing to do
}
404
405UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
406{
407    int32_t firstByte;
408
409    it->index = it->nextIndex;
410    it->error = FALSE;
411    firstByte = it->charValue = it->nextByte(det);
412
413    if (firstByte < 0) {
414        return FALSE;
415    }
416
417    if (firstByte <= 0x7F || firstByte == 0xFF) {
418        // single byte character.
419        return TRUE;
420    }
421
422    int32_t secondByte = it->nextByte(det);
423    if (secondByte >= 0)  {
424        it->charValue = (it->charValue << 8) | secondByte;
425    }
426    // else we'll handle the error later.
427
428    if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
429        it->error = TRUE;
430    }
431
432    return TRUE;
433}
434
// Returns the charset name for this recognizer, the constant string "Big5".
const char *CharsetRecog_big5::getName() const
{
    return "Big5";
}
439
// Returns the constant string "zh".
// NOTE(review): presumably a language code for Chinese -- confirm against the base class contract.
const char *CharsetRecog_big5::getLanguage() const
{
    return "zh";
}
444
445UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
446{
447    int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
448    results->set(det, this, confidence);
449    return (confidence > 0);
450}
451
// Destructor. The body is intentionally empty: nothing is released here.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
{
    // nothing to do
}
456
457UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
458    int32_t firstByte  = 0;
459    int32_t secondByte = 0;
460    int32_t thirdByte  = 0;
461    int32_t fourthByte = 0;
462
463    it->index = it->nextIndex;
464    it->error = FALSE;
465    firstByte = it->charValue = it->nextByte(det);
466
467    if (firstByte < 0) {
468        // Ran off the end of the input data
469        return FALSE;
470    }
471
472    if (firstByte <= 0x80) {
473        // single byte char
474        return TRUE;
475    }
476
477    secondByte = it->nextByte(det);
478    if (secondByte >= 0) {
479        it->charValue = (it->charValue << 8) | secondByte;
480    }
481    // else we'll handle the error later.
482
483    if (firstByte >= 0x81 && firstByte <= 0xFE) {
484        // Two byte Char
485        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
486            return TRUE;
487        }
488
489        // Four byte char
490        if (secondByte >= 0x30 && secondByte <= 0x39) {
491            thirdByte = it->nextByte(det);
492
493            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
494                fourthByte = it->nextByte(det);
495
496                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
497                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
498
499                    return TRUE;
500                }
501            }
502        }
503
504        // Something wasn't valid, or we ran out of data (-1).
505        it->error = TRUE;
506    }
507
508    return TRUE;
509}
510
// Returns the charset name for this recognizer, the constant string "GB18030".
const char *CharsetRecog_gb_18030::getName() const
{
    return "GB18030";
}
515
// Returns the constant string "zh".
// NOTE(review): presumably a language code for Chinese -- confirm against the base class contract.
const char *CharsetRecog_gb_18030::getLanguage() const
{
    return "zh";
}
520
521UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
522{
523    int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
524    results->set(det, this, confidence);
525    return (confidence > 0);
526}
527
528U_NAMESPACE_END
529#endif
530