csdetect.cpp revision 64339d36f8bd4db5025fe2988eda22b491a9219c
1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 **********************************************************************
5 *   Copyright (C) 2005-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_CONVERSION
13
14#include "unicode/ucsdet.h"
15
16#include "csdetect.h"
17#include "csmatch.h"
18#include "uenumimp.h"
19
20#include "cmemory.h"
21#include "cstring.h"
22#include "umutex.h"
23#include "ucln_in.h"
24#include "uarrsort.h"
25#include "inputext.h"
26#include "csrsbcs.h"
27#include "csrmbcs.h"
28#include "csrutf8.h"
29#include "csrucode.h"
30#include "csr2022.h"
31
// Allocates uninitialized storage for `count` objects of `type` via uprv_malloc.
// No constructors run and the size multiplication is not checked for overflow.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Releases storage with uprv_free; no destructors run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35U_NAMESPACE_BEGIN
36
37struct CSRecognizerInfo : public UMemory {
38    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
40
41    ~CSRecognizerInfo() {delete recognizer;};
42
43    CharsetRecognizer *recognizer;
44    UBool isDefaultEnabled;
45};
46
47U_NAMESPACE_END
48
// Process-wide recognizer table: filled in by initRecognizers() and
// released by csdet_cleanup().
static icu::CSRecognizerInfo **fCSRecognizers = NULL;
// Guards the one-time call to initRecognizers(); reset by csdet_cleanup().
static icu::UInitOnce gCSRecognizersInitOnce;
// Number of entries in fCSRecognizers (0 until the table has been built).
static int32_t fCSRecognizers_size = 0;
52
53U_CDECL_BEGIN
54static UBool U_CALLCONV csdet_cleanup(void)
55{
56    U_NAMESPACE_USE
57    if (fCSRecognizers != NULL) {
58        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
59            delete fCSRecognizers[r];
60            fCSRecognizers[r] = NULL;
61        }
62
63        DELETE_ARRAY(fCSRecognizers);
64        fCSRecognizers = NULL;
65        fCSRecognizers_size = 0;
66    }
67    gCSRecognizersInitOnce.reset();
68
69    return TRUE;
70}
71
72static int32_t U_CALLCONV
73charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
74{
75    U_NAMESPACE_USE
76
77    const CharsetMatch **csm_l = (const CharsetMatch **) left;
78    const CharsetMatch **csm_r = (const CharsetMatch **) right;
79
80    // NOTE: compare is backwards to sort from highest to lowest.
81    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
82}
83
84static void U_CALLCONV initRecognizers(UErrorCode &status) {
85    U_NAMESPACE_USE
86    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
87    CSRecognizerInfo *tempArray[] = {
88        new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
89
90        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
91        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
92        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
93        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
94
95        new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
96        new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
97        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
98        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
99        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
100        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
101        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
102        new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
103        new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
104        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
105        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
106        new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
107        new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
108        new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
109        new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
110        new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
111
112        new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
113#if !UCONFIG_ONLY_HTML_CONVERSION
114        new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
115        new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
116
117        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
118        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
119        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
120        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
121#endif
122    };
123    int32_t rCount = UPRV_LENGTHOF(tempArray);
124
125    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
126
127    if (fCSRecognizers == NULL) {
128        status = U_MEMORY_ALLOCATION_ERROR;
129    }
130    else {
131        fCSRecognizers_size = rCount;
132        for (int32_t r = 0; r < rCount; r += 1) {
133            fCSRecognizers[r] = tempArray[r];
134            if (fCSRecognizers[r] == NULL) {
135                status = U_MEMORY_ALLOCATION_ERROR;
136            }
137        }
138    }
139}
140
141U_CDECL_END
142
143U_NAMESPACE_BEGIN
144
/**
 * Ensures the global recognizer table has been built, running
 * initRecognizers() through the gCSRecognizersInitOnce guard.
 *
 * @param status  passed through to umtx_initOnce / initRecognizers.
 */
void CharsetDetector::setRecognizers(UErrorCode &status)
{
    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
}
149
150CharsetDetector::CharsetDetector(UErrorCode &status)
151  : textIn(new InputText(status)), resultArray(NULL),
152    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
153    fEnabledRecognizers(NULL)
154{
155    if (U_FAILURE(status)) {
156        return;
157    }
158
159    setRecognizers(status);
160
161    if (U_FAILURE(status)) {
162        return;
163    }
164
165    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
166
167    if (resultArray == NULL) {
168        status = U_MEMORY_ALLOCATION_ERROR;
169        return;
170    }
171
172    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
173        resultArray[i] = new CharsetMatch();
174
175        if (resultArray[i] == NULL) {
176            status = U_MEMORY_ALLOCATION_ERROR;
177            break;
178        }
179    }
180}
181
182CharsetDetector::~CharsetDetector()
183{
184    delete textIn;
185
186    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
187        delete resultArray[i];
188    }
189
190    uprv_free(resultArray);
191
192    if (fEnabledRecognizers) {
193        uprv_free(fEnabledRecognizers);
194    }
195}
196
/**
 * Hands the input buffer to the InputText holder and marks the text as
 * fresh, which makes the next detectAll() call re-run the recognizers.
 *
 * @param in   input bytes, forwarded to InputText::setText
 * @param len  length argument, forwarded to InputText::setText
 */
void CharsetDetector::setText(const char *in, int32_t len)
{
    textIn->setText(in, len);
    fFreshTextSet = TRUE;
}
202
203UBool CharsetDetector::setStripTagsFlag(UBool flag)
204{
205    UBool temp = fStripTags;
206    fStripTags = flag;
207    fFreshTextSet = TRUE;
208    return temp;
209}
210
/** Returns the current strip-tags flag (the value passed to MungeInput by detectAll). */
UBool CharsetDetector::getStripTagsFlag() const
{
    return fStripTags;
}
215
/**
 * Forwards a declared encoding name to the InputText holder.
 * Does not touch fFreshTextSet, so results already computed are not invalidated.
 *
 * @param encoding  encoding name, forwarded unchanged
 * @param len       length argument, forwarded unchanged
 */
void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
{
    textIn->setDeclaredEncoding(encoding,len);
}
220
/**
 * Returns the number of entries in the global recognizer table, building
 * the table first if necessary.
 *
 * NOTE: the initialization status is local and discarded, so a failed
 * initialization is not reported to the caller; whatever value
 * fCSRecognizers_size holds is returned.
 */
int32_t CharsetDetector::getDetectableCount()
{
    UErrorCode status = U_ZERO_ERROR;

    setRecognizers(status);

    return fCSRecognizers_size;
}
229
230const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
231{
232    int32_t maxMatchesFound = 0;
233
234    detectAll(maxMatchesFound, status);
235
236    if(maxMatchesFound > 0) {
237        return resultArray[0];
238    } else {
239        return NULL;
240    }
241}
242
243const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
244{
245    if(!textIn->isSet()) {
246        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
247
248        return NULL;
249    } else if (fFreshTextSet) {
250        CharsetRecognizer *csr;
251        int32_t            i;
252
253        textIn->MungeInput(fStripTags);
254
255        // Iterate over all possible charsets, remember all that
256        // give a match quality > 0.
257        resultCount = 0;
258        for (i = 0; i < fCSRecognizers_size; i += 1) {
259            csr = fCSRecognizers[i]->recognizer;
260            if (csr->match(textIn, resultArray[resultCount])) {
261                resultCount++;
262            }
263        }
264
265        if (resultCount > 1) {
266            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
267        }
268        fFreshTextSet = FALSE;
269    }
270
271    maxMatchesFound = resultCount;
272
273    return resultArray;
274}
275
276void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
277{
278    if (U_FAILURE(status)) {
279        return;
280    }
281
282    int32_t modIdx = -1;
283    UBool isDefaultVal = FALSE;
284    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
285        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
286        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
287            modIdx = i;
288            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
289            break;
290        }
291    }
292    if (modIdx < 0) {
293        // No matching encoding found
294        status = U_ILLEGAL_ARGUMENT_ERROR;
295        return;
296    }
297
298    if (fEnabledRecognizers == NULL && !isDefaultVal) {
299        // Create an array storing the non default setting
300        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
301        if (fEnabledRecognizers == NULL) {
302            status = U_MEMORY_ALLOCATION_ERROR;
303            return;
304        }
305        // Initialize the array with default info
306        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
307            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
308        }
309    }
310
311    if (fEnabledRecognizers != NULL) {
312        fEnabledRecognizers[modIdx] = enabled;
313    }
314}
315
316/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
317{
318    if( index > fCSRecognizers_size-1 || index < 0) {
319        status = U_INDEX_OUTOFBOUNDS_ERROR;
320
321        return 0;
322    } else {
323        return fCSRecognizers[index]->getName();
324    }
325}*/
326
327U_NAMESPACE_END
328
329U_CDECL_BEGIN
// Per-enumeration state stored in UEnumeration::context by
// getAllDetectableCharsets() / getDetectableCharsets().
typedef struct {
    int32_t currIndex;            // next index into fCSRecognizers to examine
    UBool all;                    // TRUE: list every recognizer; FALSE: only enabled ones
    UBool *enabledRecognizers;    // detector's override array (not freed by enumClose), or NULL for defaults
} Context;
335
336
337
338static void U_CALLCONV
339enumClose(UEnumeration *en) {
340    if(en->context != NULL) {
341        DELETE_ARRAY(en->context);
342    }
343
344    DELETE_ARRAY(en);
345}
346
347static int32_t U_CALLCONV
348enumCount(UEnumeration *en, UErrorCode *) {
349    if (((Context *)en->context)->all) {
350        // ucsdet_getAllDetectableCharsets, all charset detector names
351        return fCSRecognizers_size;
352    }
353
354    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
355    int32_t count = 0;
356    UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
357    if (enabledArray != NULL) {
358        // custom set
359        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
360            if (enabledArray[i]) {
361                count++;
362            }
363        }
364    } else {
365        // default set
366        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
367            if (fCSRecognizers[i]->isDefaultEnabled) {
368                count++;
369            }
370        }
371    }
372    return count;
373}
374
375static const char* U_CALLCONV
376enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
377    const char *currName = NULL;
378
379    if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
380        if (((Context *)en->context)->all) {
381            // ucsdet_getAllDetectableCharsets, all charset detector names
382            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
383            ((Context *)en->context)->currIndex++;
384        } else {
385            // ucsdet_getDetectableCharsets
386            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
387            if (enabledArray != NULL) {
388                // custome set
389                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
390                    if (enabledArray[((Context *)en->context)->currIndex]) {
391                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
392                    }
393                    ((Context *)en->context)->currIndex++;
394                }
395            } else {
396                // default set
397                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
398                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
399                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
400                    }
401                    ((Context *)en->context)->currIndex++;
402                }
403            }
404        }
405    }
406
407    if(resultLength != NULL) {
408        *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
409    }
410
411    return currName;
412}
413
414
415static void U_CALLCONV
416enumReset(UEnumeration *en, UErrorCode *) {
417    ((Context *)en->context)->currIndex = 0;
418}
419
// Template UEnumeration that getAllDetectableCharsets() and
// getDetectableCharsets() copy with memcpy before attaching a Context.
// NOTE(review): field order is assumed to follow the UEnumeration
// declaration in uenumimp.h — confirm there.
static const UEnumeration gCSDetEnumeration = {
    NULL,
    NULL,                 // presumably `context`; callers overwrite en->context after copying
    enumClose,
    enumCount,
    uenum_unextDefault,
    enumNext,
    enumReset
};
429
430U_CDECL_END
431
432U_NAMESPACE_BEGIN
433
434UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
435{
436
437    /* Initialize recognized charsets. */
438    setRecognizers(status);
439
440    if(U_FAILURE(status)) {
441        return 0;
442    }
443
444    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
445    if (en == NULL) {
446        status = U_MEMORY_ALLOCATION_ERROR;
447        return 0;
448    }
449    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
450    en->context = (void*)NEW_ARRAY(Context, 1);
451    if (en->context == NULL) {
452        status = U_MEMORY_ALLOCATION_ERROR;
453        DELETE_ARRAY(en);
454        return 0;
455    }
456    uprv_memset(en->context, 0, sizeof(Context));
457    ((Context*)en->context)->all = TRUE;
458    return en;
459}
460
461UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
462{
463    if(U_FAILURE(status)) {
464        return 0;
465    }
466
467    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
468    if (en == NULL) {
469        status = U_MEMORY_ALLOCATION_ERROR;
470        return 0;
471    }
472    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
473    en->context = (void*)NEW_ARRAY(Context, 1);
474    if (en->context == NULL) {
475        status = U_MEMORY_ALLOCATION_ERROR;
476        DELETE_ARRAY(en);
477        return 0;
478    }
479    uprv_memset(en->context, 0, sizeof(Context));
480    ((Context*)en->context)->all = FALSE;
481    ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
482    return en;
483}
484
485U_NAMESPACE_END
486
487#endif
488