1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2008, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "csrmbcs.h"
13
14#include <math.h>
15
16U_NAMESPACE_BEGIN
17
18#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
19
20#define min(x,y) (((x)<(y))?(x):(y))
21
// Frequently occurring two-byte Shift_JIS code values, in ascending order
// (binarySearch() depends on that ordering).
// TODO:  This data comes from the character frequency-of-occurrence
//        analysis tool.  It needs to be moved into a resource and
//        loaded from there.
static const uint16_t commonChars_sjis [] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
};
32
// Frequently occurring two-byte EUC-JP code values, in ascending order
// (binarySearch() depends on that ordering).
// TODO:  This data comes from the character frequency-of-occurrence
//        analysis tool.  It needs to be moved into a resource and
//        loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
};
47
// Frequently occurring two-byte EUC-KR code values, in ascending order
// (binarySearch() depends on that ordering).
// TODO:  This data comes from the character frequency-of-occurrence
//        analysis tool.  It needs to be moved into a resource and
//        loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
};
62
// Frequently occurring two-byte Big5 code values, in ascending order
// (binarySearch() depends on that ordering).
// TODO:  This data comes from the character frequency-of-occurrence
//        analysis tool.  It needs to be moved into a resource and
//        loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
};
77
// Frequently occurring two-byte GB18030 code values, in ascending order
// (binarySearch() depends on that ordering).
// TODO:  This data comes from the character frequency-of-occurrence
//        analysis tool.  It needs to be moved into a resource and
//        loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
};
92
// Binary search over an array of 16-bit values sorted in ascending order.
//   array  the sorted values to search
//   len    number of entries in array
//   value  the value to look for
// Returns the index of an entry equal to value, or -1 if there is none.
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    while (lo <= hi) {
        const int32_t  probe     = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            // Target, if present, lies above the probe.
            lo = probe + 1;
        } else {
            // Target, if present, lies below the probe.
            hi = probe - 1;
        }
    }

    return -1;
}
114
115IteratedChar::IteratedChar() :
116charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
117{
118    // nothing else to do.
119}
120
121/*void IteratedChar::reset()
122{
123    charValue = 0;
124    index     = -1;
125    nextIndex = 0;
126    error     = FALSE;
127    done      = FALSE;
128}*/
129
130int32_t IteratedChar::nextByte(InputText *det)
131{
132    if (nextIndex >= det->fRawLength) {
133        done = TRUE;
134
135        return -1;
136    }
137
138    return det->fRawInput[nextIndex++];
139}
140
141CharsetRecog_mbcs::~CharsetRecog_mbcs()
142{
143    // nothing to do.
144}
145
146int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
147    int32_t singleByteCharCount = 0;
148    int32_t doubleByteCharCount = 0;
149    int32_t commonCharCount     = 0;
150    int32_t badCharCount        = 0;
151    int32_t totalCharCount      = 0;
152    int32_t confidence          = 0;
153    IteratedChar iter;
154
155    while (nextChar(&iter, det)) {
156        totalCharCount++;
157
158        if (iter.error) {
159            badCharCount++;
160        } else {
161            if (iter.charValue <= 0xFF) {
162                singleByteCharCount++;
163            } else {
164                doubleByteCharCount++;
165
166                if (commonChars != 0) {
167                    if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
168                        commonCharCount += 1;
169                    }
170                }
171            }
172        }
173
174
175        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
176            // Bail out early if the byte data is not matching the encoding scheme.
177            // break detectBlock;
178            return confidence;
179        }
180    }
181
182    if (doubleByteCharCount <= 10 && badCharCount == 0) {
183        // Not many multi-byte chars.
184        if (doubleByteCharCount == 0 && totalCharCount < 10) {
185            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
186            // We don't have enough data to have any confidence.
187            // Statistical analysis of single byte non-ASCII charcters would probably help here.
188            confidence = 0;
189        }
190        else {
191            //   ASCII or ISO file?  It's probably not our encoding,
192            //   but is not incompatible with our encoding, so don't give it a zero.
193            confidence = 10;
194        }
195
196        return confidence;
197    }
198
199    //
200    //  No match if there are too many characters that don't fit the encoding scheme.
201    //    (should we have zero tolerance for these?)
202    //
203    if (doubleByteCharCount < 20*badCharCount) {
204        confidence = 0;
205
206        return confidence;
207    }
208
209    if (commonChars == 0) {
210        // We have no statistics on frequently occuring characters.
211        //  Assess confidence purely on having a reasonable number of
212        //  multi-byte characters (the more the better)
213        confidence = 30 + doubleByteCharCount - 20*badCharCount;
214
215        if (confidence > 100) {
216            confidence = 100;
217        }
218    } else {
219        //
220        // Frequency of occurence statistics exist.
221        //
222
223        double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
224        double scaleFactor = 90.0 / maxVal;
225        confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
226
227        confidence = min(confidence, 100);
228    }
229
230    if (confidence < 0) {
231        confidence = 0;
232    }
233
234    return confidence;
235}
236
237CharsetRecog_sjis::~CharsetRecog_sjis()
238{
239    // nothing to do
240}
241
242UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
243    it->index = it->nextIndex;
244    it->error = FALSE;
245
246    int32_t firstByte = it->charValue = it->nextByte(det);
247
248    if (firstByte < 0) {
249        return FALSE;
250    }
251
252    if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
253        return TRUE;
254    }
255
256    int32_t secondByte = it->nextByte(det);
257    if (secondByte >= 0) {
258        it->charValue = (firstByte << 8) | secondByte;
259    }
260    // else we'll handle the error later.
261
262    if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
263        // Illegal second byte value.
264        it->error = TRUE;
265    }
266
267    return TRUE;
268}
269
270int32_t CharsetRecog_sjis::match(InputText* det)
271{
272    return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
273}
274
275const char *CharsetRecog_sjis::getName() const
276{
277    return "Shift_JIS";
278}
279
280const char *CharsetRecog_sjis::getLanguage() const
281{
282    return "ja";
283}
284
285CharsetRecog_euc::~CharsetRecog_euc()
286{
287    // nothing to do
288}
289
290UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
291    int32_t firstByte  = 0;
292    int32_t secondByte = 0;
293    int32_t thirdByte  = 0;
294
295    it->index = it->nextIndex;
296    it->error = FALSE;
297    firstByte = it->charValue = it->nextByte(det);
298
299    if (firstByte < 0) {
300        // Ran off the end of the input data
301        return FALSE;
302    }
303
304    if (firstByte <= 0x8D) {
305        // single byte char
306        return TRUE;
307    }
308
309    secondByte = it->nextByte(det);
310    if (secondByte >= 0) {
311        it->charValue = (it->charValue << 8) | secondByte;
312    }
313    // else we'll handle the error later.
314
315    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
316        // Two byte Char
317        if (secondByte < 0xA1) {
318            it->error = TRUE;
319        }
320
321        return TRUE;
322    }
323
324    if (firstByte == 0x8E) {
325        // Code Set 2.
326        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
327        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
328        // We don't know which we've got.
329        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
330        //   bytes will look like a well formed 2 byte char.
331        if (secondByte < 0xA1) {
332            it->error = TRUE;
333        }
334
335        return TRUE;
336    }
337
338    if (firstByte == 0x8F) {
339        // Code set 3.
340        // Three byte total char size, two bytes of actual char value.
341        thirdByte    = it->nextByte(det);
342        it->charValue = (it->charValue << 8) | thirdByte;
343
344        if (thirdByte < 0xa1) {
345            // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
346            it->error = TRUE;
347        }
348    }
349
350    return TRUE;
351
352}
353
354CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
355{
356    // nothing to do
357}
358
359const char *CharsetRecog_euc_jp::getName() const
360{
361    return "EUC-JP";
362}
363
364const char *CharsetRecog_euc_jp::getLanguage() const
365{
366    return "ja";
367}
368
369int32_t CharsetRecog_euc_jp::match(InputText *det)
370{
371    return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
372}
373
374CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
375{
376    // nothing to do
377}
378
379const char *CharsetRecog_euc_kr::getName() const
380{
381    return "EUC-KR";
382}
383
384const char *CharsetRecog_euc_kr::getLanguage() const
385{
386    return "ko";
387}
388
389int32_t CharsetRecog_euc_kr::match(InputText *det)
390{
391    return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
392}
393
394CharsetRecog_big5::~CharsetRecog_big5()
395{
396    // nothing to do
397}
398
399UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
400{
401    int32_t firstByte;
402
403    it->index = it->nextIndex;
404    it->error = FALSE;
405    firstByte = it->charValue = it->nextByte(det);
406
407    if (firstByte < 0) {
408        return FALSE;
409    }
410
411    if (firstByte <= 0x7F || firstByte == 0xFF) {
412        // single byte character.
413        return TRUE;
414    }
415
416    int32_t secondByte = it->nextByte(det);
417    if (secondByte >= 0)  {
418        it->charValue = (it->charValue << 8) | secondByte;
419    }
420    // else we'll handle the error later.
421
422    if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
423        it->error = TRUE;
424    }
425
426    return TRUE;
427}
428
429const char *CharsetRecog_big5::getName() const
430{
431    return "Big5";
432}
433
434const char *CharsetRecog_big5::getLanguage() const
435{
436    return "zh";
437}
438
439int32_t CharsetRecog_big5::match(InputText *det)
440{
441    return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
442}
443
444CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
445{
446    // nothing to do
447}
448
449UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
450    int32_t firstByte  = 0;
451    int32_t secondByte = 0;
452    int32_t thirdByte  = 0;
453    int32_t fourthByte = 0;
454
455    it->index = it->nextIndex;
456    it->error = FALSE;
457    firstByte = it->charValue = it->nextByte(det);
458
459    if (firstByte < 0) {
460        // Ran off the end of the input data
461        return FALSE;
462    }
463
464    if (firstByte <= 0x80) {
465        // single byte char
466        return TRUE;
467    }
468
469    secondByte = it->nextByte(det);
470    if (secondByte >= 0) {
471        it->charValue = (it->charValue << 8) | secondByte;
472    }
473    // else we'll handle the error later.
474
475    if (firstByte >= 0x81 && firstByte <= 0xFE) {
476        // Two byte Char
477        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
478            return TRUE;
479        }
480
481        // Four byte char
482        if (secondByte >= 0x30 && secondByte <= 0x39) {
483            thirdByte = it->nextByte(det);
484
485            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
486                fourthByte = it->nextByte(det);
487
488                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
489                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
490
491                    return TRUE;
492                }
493            }
494        }
495
496        // Something wasn't valid, or we ran out of data (-1).
497        it->error = TRUE;
498    }
499
500    return TRUE;
501}
502
503const char *CharsetRecog_gb_18030::getName() const
504{
505    return "GB18030";
506}
507
508const char *CharsetRecog_gb_18030::getLanguage() const
509{
510    return "zh";
511}
512
513int32_t CharsetRecog_gb_18030::match(InputText *det)
514{
515    return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
516}
517
518U_NAMESPACE_END
519#endif
520