1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2013, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "unicode/ucsdet.h"
13
14#include "csdetect.h"
15#include "csmatch.h"
16#include "uenumimp.h"
17
18#include "cmemory.h"
19#include "cstring.h"
20#include "umutex.h"
21#include "ucln_in.h"
22#include "uarrsort.h"
23#include "inputext.h"
24#include "csrsbcs.h"
25#include "csrmbcs.h"
26#include "csrutf8.h"
27#include "csrucode.h"
28#include "csr2022.h"
29
// Number of elements in a true array (not a pointer).
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocate / release raw storage for `count` objects of `type` through the
// uprv_ allocator. No constructors or destructors are run.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) (uprv_free((void *) (array)))
34
35U_NAMESPACE_BEGIN
36
37struct CSRecognizerInfo : public UMemory {
38    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
40
41    ~CSRecognizerInfo() {delete recognizer;};
42
43    CharsetRecognizer *recognizer;
44    UBool isDefaultEnabled;
45};
46
47U_NAMESPACE_END
48
49static icu::CSRecognizerInfo **fCSRecognizers = NULL;
50static icu::UInitOnce gCSRecognizersInitOnce;
51static int32_t fCSRecognizers_size = 0;
52
53U_CDECL_BEGIN
54static UBool U_CALLCONV csdet_cleanup(void)
55{
56    U_NAMESPACE_USE
57    if (fCSRecognizers != NULL) {
58        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
59            delete fCSRecognizers[r];
60            fCSRecognizers[r] = NULL;
61        }
62
63        DELETE_ARRAY(fCSRecognizers);
64        fCSRecognizers = NULL;
65        fCSRecognizers_size = 0;
66    }
67    gCSRecognizersInitOnce.reset();
68
69    return TRUE;
70}
71
72static int32_t U_CALLCONV
73charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
74{
75    U_NAMESPACE_USE
76
77    const CharsetMatch **csm_l = (const CharsetMatch **) left;
78    const CharsetMatch **csm_r = (const CharsetMatch **) right;
79
80    // NOTE: compare is backwards to sort from highest to lowest.
81    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
82}
83
84static void U_CALLCONV initRecognizers(UErrorCode &status) {
85    U_NAMESPACE_USE
86    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
87    CSRecognizerInfo *tempArray[] = {
88        new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
89
90        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
91        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
92        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
93        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
94
95        new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
96        new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
97        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
98        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
99        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
100        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
101        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
102        new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
103        new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
104        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
105        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
106        new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
107        new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
108        new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
109        new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
110        new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
111
112        new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
113        new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
114        new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
115
116        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
117        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
118        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
119        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
120    };
121    int32_t rCount = ARRAY_SIZE(tempArray);
122
123    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
124
125    if (fCSRecognizers == NULL) {
126        status = U_MEMORY_ALLOCATION_ERROR;
127    }
128    else {
129        fCSRecognizers_size = rCount;
130        for (int32_t r = 0; r < rCount; r += 1) {
131            fCSRecognizers[r] = tempArray[r];
132            if (fCSRecognizers[r] == NULL) {
133                status = U_MEMORY_ALLOCATION_ERROR;
134            }
135        }
136    }
137}
138
139U_CDECL_END
140
141U_NAMESPACE_BEGIN
142
143void CharsetDetector::setRecognizers(UErrorCode &status)
144{
145    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
146}
147
148CharsetDetector::CharsetDetector(UErrorCode &status)
149  : textIn(new InputText(status)), resultArray(NULL),
150    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
151    fEnabledRecognizers(NULL)
152{
153    if (U_FAILURE(status)) {
154        return;
155    }
156
157    setRecognizers(status);
158
159    if (U_FAILURE(status)) {
160        return;
161    }
162
163    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
164
165    if (resultArray == NULL) {
166        status = U_MEMORY_ALLOCATION_ERROR;
167        return;
168    }
169
170    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
171        resultArray[i] = new CharsetMatch();
172
173        if (resultArray[i] == NULL) {
174            status = U_MEMORY_ALLOCATION_ERROR;
175            break;
176        }
177    }
178}
179
180CharsetDetector::~CharsetDetector()
181{
182    delete textIn;
183
184    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
185        delete resultArray[i];
186    }
187
188    uprv_free(resultArray);
189
190    if (fEnabledRecognizers) {
191        uprv_free(fEnabledRecognizers);
192    }
193}
194
195void CharsetDetector::setText(const char *in, int32_t len)
196{
197    textIn->setText(in, len);
198    fFreshTextSet = TRUE;
199}
200
201UBool CharsetDetector::setStripTagsFlag(UBool flag)
202{
203    UBool temp = fStripTags;
204    fStripTags = flag;
205    fFreshTextSet = TRUE;
206    return temp;
207}
208
209UBool CharsetDetector::getStripTagsFlag() const
210{
211    return fStripTags;
212}
213
214void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
215{
216    textIn->setDeclaredEncoding(encoding,len);
217}
218
219int32_t CharsetDetector::getDetectableCount()
220{
221    UErrorCode status = U_ZERO_ERROR;
222
223    setRecognizers(status);
224
225    return fCSRecognizers_size;
226}
227
228const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
229{
230    int32_t maxMatchesFound = 0;
231
232    detectAll(maxMatchesFound, status);
233
234    if(maxMatchesFound > 0) {
235        return resultArray[0];
236    } else {
237        return NULL;
238    }
239}
240
241const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
242{
243    if(!textIn->isSet()) {
244        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
245
246        return NULL;
247    } else if (fFreshTextSet) {
248        CharsetRecognizer *csr;
249        int32_t            i;
250
251        textIn->MungeInput(fStripTags);
252
253        // Iterate over all possible charsets, remember all that
254        // give a match quality > 0.
255        resultCount = 0;
256        for (i = 0; i < fCSRecognizers_size; i += 1) {
257            csr = fCSRecognizers[i]->recognizer;
258            if (csr->match(textIn, resultArray[resultCount])) {
259                resultCount++;
260            }
261        }
262
263        if (resultCount > 1) {
264            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
265        }
266        fFreshTextSet = FALSE;
267    }
268
269    maxMatchesFound = resultCount;
270
271    return resultArray;
272}
273
274void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
275{
276    if (U_FAILURE(status)) {
277        return;
278    }
279
280    int32_t modIdx = -1;
281    UBool isDefaultVal = FALSE;
282    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
283        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
284        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
285            modIdx = i;
286            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
287            break;
288        }
289    }
290    if (modIdx < 0) {
291        // No matching encoding found
292        status = U_ILLEGAL_ARGUMENT_ERROR;
293        return;
294    }
295
296    if (fEnabledRecognizers == NULL && !isDefaultVal) {
297        // Create an array storing the non default setting
298        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
299        if (fEnabledRecognizers == NULL) {
300            status = U_MEMORY_ALLOCATION_ERROR;
301            return;
302        }
303        // Initialize the array with default info
304        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
305            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
306        }
307    }
308
309    if (fEnabledRecognizers != NULL) {
310        fEnabledRecognizers[modIdx] = enabled;
311    }
312}
313
314/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
315{
316    if( index > fCSRecognizers_size-1 || index < 0) {
317        status = U_INDEX_OUTOFBOUNDS_ERROR;
318
319        return 0;
320    } else {
321        return fCSRecognizers[index]->getName();
322    }
323}*/
324
325U_NAMESPACE_END
326
327U_CDECL_BEGIN
328typedef struct {
329    int32_t currIndex;
330    UBool all;
331    UBool *enabledRecognizers;
332} Context;
333
334
335
336static void U_CALLCONV
337enumClose(UEnumeration *en) {
338    if(en->context != NULL) {
339        DELETE_ARRAY(en->context);
340    }
341
342    DELETE_ARRAY(en);
343}
344
345static int32_t U_CALLCONV
346enumCount(UEnumeration *en, UErrorCode *) {
347    if (((Context *)en->context)->all) {
348        // ucsdet_getAllDetectableCharsets, all charset detector names
349        return fCSRecognizers_size;
350    }
351
352    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
353    int32_t count = 0;
354    UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
355    if (enabledArray != NULL) {
356        // custom set
357        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
358            if (enabledArray[i]) {
359                count++;
360            }
361        }
362    } else {
363        // default set
364        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
365            if (fCSRecognizers[i]->isDefaultEnabled) {
366                count++;
367            }
368        }
369    }
370    return count;
371}
372
373static const char* U_CALLCONV
374enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
375    const char *currName = NULL;
376
377    if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
378        if (((Context *)en->context)->all) {
379            // ucsdet_getAllDetectableCharsets, all charset detector names
380            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
381            ((Context *)en->context)->currIndex++;
382        } else {
383            // ucsdet_getDetectableCharsets
384            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
385            if (enabledArray != NULL) {
386                // custome set
387                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
388                    if (enabledArray[((Context *)en->context)->currIndex]) {
389                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
390                    }
391                    ((Context *)en->context)->currIndex++;
392                }
393            } else {
394                // default set
395                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
396                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
397                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
398                    }
399                    ((Context *)en->context)->currIndex++;
400                }
401            }
402        }
403    }
404
405    if(resultLength != NULL) {
406        *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
407    }
408
409    return currName;
410}
411
412
413static void U_CALLCONV
414enumReset(UEnumeration *en, UErrorCode *) {
415    ((Context *)en->context)->currIndex = 0;
416}
417
418static const UEnumeration gCSDetEnumeration = {
419    NULL,
420    NULL,
421    enumClose,
422    enumCount,
423    uenum_unextDefault,
424    enumNext,
425    enumReset
426};
427
428U_CDECL_END
429
430U_NAMESPACE_BEGIN
431
432UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
433{
434
435    /* Initialize recognized charsets. */
436    setRecognizers(status);
437
438    if(U_FAILURE(status)) {
439        return 0;
440    }
441
442    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
443    if (en == NULL) {
444        status = U_MEMORY_ALLOCATION_ERROR;
445        return 0;
446    }
447    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
448    en->context = (void*)NEW_ARRAY(Context, 1);
449    if (en->context == NULL) {
450        status = U_MEMORY_ALLOCATION_ERROR;
451        DELETE_ARRAY(en);
452        return 0;
453    }
454    uprv_memset(en->context, 0, sizeof(Context));
455    ((Context*)en->context)->all = TRUE;
456    return en;
457}
458
459UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
460{
461    if(U_FAILURE(status)) {
462        return 0;
463    }
464
465    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
466    if (en == NULL) {
467        status = U_MEMORY_ALLOCATION_ERROR;
468        return 0;
469    }
470    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
471    en->context = (void*)NEW_ARRAY(Context, 1);
472    if (en->context == NULL) {
473        status = U_MEMORY_ALLOCATION_ERROR;
474        DELETE_ARRAY(en);
475        return 0;
476    }
477    uprv_memset(en->context, 0, sizeof(Context));
478    ((Context*)en->context)->all = FALSE;
479    ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
480    return en;
481}
482
483U_NAMESPACE_END
484
485#endif
486