1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2009, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "unicode/ucsdet.h"
13
14#include "csdetect.h"
15#include "csmatch.h"
16#include "uenumimp.h"
17
18#include "cmemory.h"
19#include "cstring.h"
20#include "umutex.h"
21#include "ucln_in.h"
22#include "uarrsort.h"
23#include "inputext.h"
24#include "csrsbcs.h"
25#include "csrmbcs.h"
26#include "csrutf8.h"
27#include "csrucode.h"
28#include "csr2022.h"
29
/* Number of elements in a real array object (not a pointer). The argument is
 * parenthesized so that any array-valued expression can be passed safely. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocates uninitialized storage for `count` objects of `type` through
 * uprv_malloc(). The whole expansion is parenthesized so the result can be
 * used inside a larger expression. Callers in this file test the result
 * against NULL. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
/* Releases storage obtained from NEW_ARRAY through uprv_free(). */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35U_CDECL_BEGIN
36static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;
37
38static int32_t fCSRecognizers_size = 0;
39
40static UBool U_CALLCONV csdet_cleanup(void)
41{
42    if (fCSRecognizers != NULL) {
43        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44            delete fCSRecognizers[r];
45            fCSRecognizers[r] = NULL;
46        }
47
48        DELETE_ARRAY(fCSRecognizers);
49        fCSRecognizers = NULL;
50        fCSRecognizers_size = 0;
51    }
52
53    return TRUE;
54}
55
56static int32_t U_CALLCONV
57charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58{
59    U_NAMESPACE_USE
60
61    const CharsetMatch **csm_l = (const CharsetMatch **) left;
62    const CharsetMatch **csm_r = (const CharsetMatch **) right;
63
64    // NOTE: compare is backwards to sort from highest to lowest.
65    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66}
67
68U_CDECL_END
69
70U_NAMESPACE_BEGIN
71
72void CharsetDetector::setRecognizers(UErrorCode &status)
73{
74    UBool needsInit;
75    CharsetRecognizer **recognizers;
76
77    if (U_FAILURE(status)) {
78        return;
79    }
80
81    UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82
83    if (needsInit) {
84        CharsetRecognizer *tempArray[] = {
85            new CharsetRecog_UTF8(),
86
87            new CharsetRecog_UTF_16_BE(),
88            new CharsetRecog_UTF_16_LE(),
89            new CharsetRecog_UTF_32_BE(),
90            new CharsetRecog_UTF_32_LE(),
91
92            new CharsetRecog_8859_1_en(),
93            new CharsetRecog_8859_1_da(),
94            new CharsetRecog_8859_1_de(),
95            new CharsetRecog_8859_1_es(),
96            new CharsetRecog_8859_1_fr(),
97            new CharsetRecog_8859_1_it(),
98            new CharsetRecog_8859_1_nl(),
99            new CharsetRecog_8859_1_no(),
100            new CharsetRecog_8859_1_pt(),
101            new CharsetRecog_8859_1_sv(),
102            new CharsetRecog_8859_2_cs(),
103            new CharsetRecog_8859_2_hu(),
104            new CharsetRecog_8859_2_pl(),
105            new CharsetRecog_8859_2_ro(),
106            new CharsetRecog_8859_5_ru(),
107            new CharsetRecog_8859_6_ar(),
108            new CharsetRecog_8859_7_el(),
109            new CharsetRecog_8859_8_I_he(),
110            new CharsetRecog_8859_8_he(),
111            new CharsetRecog_windows_1251(),
112            new CharsetRecog_windows_1256(),
113            new CharsetRecog_KOI8_R(),
114            new CharsetRecog_8859_9_tr(),
115            new CharsetRecog_sjis(),
116            new CharsetRecog_gb_18030(),
117            new CharsetRecog_euc_jp(),
118            new CharsetRecog_euc_kr(),
119            new CharsetRecog_big5(),
120
121            new CharsetRecog_2022JP(),
122            new CharsetRecog_2022KR(),
123            new CharsetRecog_2022CN(),
124
125            new CharsetRecog_IBM424_he_rtl(),
126            new CharsetRecog_IBM424_he_ltr(),
127            new CharsetRecog_IBM420_ar_rtl(),
128            new CharsetRecog_IBM420_ar_ltr()
129        };
130        int32_t rCount = ARRAY_SIZE(tempArray);
131        int32_t r;
132
133        recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
134
135        if (recognizers == NULL) {
136            status = U_MEMORY_ALLOCATION_ERROR;
137            return;
138        } else {
139            for (r = 0; r < rCount; r += 1) {
140                recognizers[r] = tempArray[r];
141
142                if (recognizers[r] == NULL) {
143                    status = U_MEMORY_ALLOCATION_ERROR;
144                    break;
145                }
146            }
147        }
148
149        if (U_SUCCESS(status)) {
150            umtx_lock(NULL);
151            if (fCSRecognizers == NULL) {
152                fCSRecognizers_size = rCount;
153                fCSRecognizers = recognizers;
154            }
155            umtx_unlock(NULL);
156        }
157
158        if (fCSRecognizers != recognizers) {
159            for (r = 0; r < rCount; r += 1) {
160                delete recognizers[r];
161                recognizers[r] = NULL;
162            }
163
164            DELETE_ARRAY(recognizers);
165        }
166
167        recognizers = NULL;
168        ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
169    }
170}
171
172CharsetDetector::CharsetDetector(UErrorCode &status)
173  : textIn(new InputText(status)), resultArray(NULL),
174    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
175{
176    if (U_FAILURE(status)) {
177        return;
178    }
179
180    setRecognizers(status);
181
182    if (U_FAILURE(status)) {
183        return;
184    }
185
186    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
187
188    if (resultArray == NULL) {
189        status = U_MEMORY_ALLOCATION_ERROR;
190        return;
191    }
192
193    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
194        resultArray[i] = new CharsetMatch();
195
196        if (resultArray[i] == NULL) {
197            status = U_MEMORY_ALLOCATION_ERROR;
198            break;
199        }
200    }
201}
202
203CharsetDetector::~CharsetDetector()
204{
205    delete textIn;
206
207    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
208        delete resultArray[i];
209    }
210
211    uprv_free(resultArray);
212}
213
214void CharsetDetector::setText(const char *in, int32_t len)
215{
216    textIn->setText(in, len);
217    fFreshTextSet = TRUE;
218}
219
220UBool CharsetDetector::setStripTagsFlag(UBool flag)
221{
222    UBool temp = fStripTags;
223    fStripTags = flag;
224    fFreshTextSet = TRUE;
225    return temp;
226}
227
228UBool CharsetDetector::getStripTagsFlag() const
229{
230    return fStripTags;
231}
232
233void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
234{
235    textIn->setDeclaredEncoding(encoding,len);
236}
237
238int32_t CharsetDetector::getDetectableCount()
239{
240    UErrorCode status = U_ZERO_ERROR;
241
242    setRecognizers(status);
243
244    return fCSRecognizers_size;
245}
246
247const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
248{
249    int32_t maxMatchesFound = 0;
250
251    detectAll(maxMatchesFound, status);
252
253    if(maxMatchesFound > 0) {
254        return resultArray[0];
255    } else {
256        return NULL;
257    }
258}
259
260const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
261{
262    if(!textIn->isSet()) {
263        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
264
265        return NULL;
266    } else if(fFreshTextSet) {
267        CharsetRecognizer *csr;
268        int32_t            detectResults;
269        int32_t            confidence;
270        int32_t            i;
271
272        textIn->MungeInput(fStripTags);
273
274        // Iterate over all possible charsets, remember all that
275        // give a match quality > 0.
276        resultCount = 0;
277        for (i = 0; i < fCSRecognizers_size; i += 1) {
278            csr = fCSRecognizers[i];
279            detectResults = csr->match(textIn);
280            confidence = detectResults;
281
282            if (confidence > 0)  {
283                resultArray[resultCount++]->set(textIn, csr, confidence);
284            }
285        }
286
287        for(i = resultCount; i < fCSRecognizers_size; i += 1) {
288            resultArray[i]->set(textIn, 0, 0);
289        }
290
291        uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
292
293        // Remove duplicate charsets from the results.
294        // Simple minded, brute force approach - check each entry against all that follow.
295        // The first entry of any duplicated set is the one that should be kept because it will
296        // be the one with the highest confidence rating.
297        //   (Duplicate matches have different languages, only the charset is the same)
298        // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
299        // deleted, just reordered, with the unwanted duplicates placed after the good results.
300        int32_t j, k;
301        for (i=0; i<resultCount; i++) {
302            const char *charSetName = resultArray[i]->getName();
303            for (j=i+1; j<resultCount; ) {
304                if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
305                    // Not a duplicate.
306                    j++;
307                } else {
308                    // Duplicate entry at index j.
309                    CharsetMatch *duplicate = resultArray[j];
310                    for (k=j; k<resultCount-1; k++) {
311                        resultArray[k] = resultArray[k+1];
312                    }
313                    resultCount--;
314                    resultArray[resultCount] = duplicate;
315                }
316            }
317        }
318
319        fFreshTextSet = FALSE;
320    }
321
322    maxMatchesFound = resultCount;
323
324    return resultArray;
325}
326
327/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
328{
329    if( index > fCSRecognizers_size-1 || index < 0) {
330        status = U_INDEX_OUTOFBOUNDS_ERROR;
331
332        return 0;
333    } else {
334        return fCSRecognizers[index]->getName();
335    }
336}*/
337
338U_NAMESPACE_END
339
340U_CDECL_BEGIN
341typedef struct {
342    int32_t currIndex;
343} Context;
344
345
346
347static void U_CALLCONV
348enumClose(UEnumeration *en) {
349    if(en->context != NULL) {
350        DELETE_ARRAY(en->context);
351    }
352
353    DELETE_ARRAY(en);
354}
355
356static int32_t U_CALLCONV
357enumCount(UEnumeration *, UErrorCode *) {
358    return fCSRecognizers_size;
359}
360
361static const char* U_CALLCONV
362enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
363    if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
364        if(resultLength != NULL) {
365            *resultLength = 0;
366        }
367        return NULL;
368    }
369    const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
370    if(resultLength != NULL) {
371        *resultLength = (int32_t)uprv_strlen(currName);
372    }
373    ((Context *)en->context)->currIndex++;
374
375    return currName;
376}
377
378static void U_CALLCONV
379enumReset(UEnumeration *en, UErrorCode *) {
380    ((Context *)en->context)->currIndex = 0;
381}
382
383static const UEnumeration gCSDetEnumeration = {
384    NULL,
385    NULL,
386    enumClose,
387    enumCount,
388    uenum_unextDefault,
389    enumNext,
390    enumReset
391};
392
393U_CAPI  UEnumeration * U_EXPORT2
394ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
395{
396    U_NAMESPACE_USE
397
398    if(U_FAILURE(*status)) {
399        return 0;
400    }
401
402    /* Initialize recognized charsets. */
403    CharsetDetector::getDetectableCount();
404
405    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
406    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
407    en->context = (void*)NEW_ARRAY(Context, 1);
408    uprv_memset(en->context, 0, sizeof(Context));
409    return en;
410}
411U_CDECL_END
412
413#endif
414
415