csdetect.cpp revision ac04d0bbe12b3ef54518635711412f178cb4d16
1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru **********************************************************************
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   Copyright (C) 2005-2007, International Business Machines
4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   Corporation and others.  All Rights Reserved.
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru **********************************************************************
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ucsdet.h"
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csdetect.h"
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csmatch.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uenumimp.h"
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h"
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "umutex.h"
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "ucln_in.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uarrsort.h"
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "inputext.h"
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrsbcs.h"
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrmbcs.h"
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrutf8.h"
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrucode.h"
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csr2022.h"
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
/* Element count of a true array (never a pointer). The operand is
 * parenthesized so that any array-valued expression can be passed safely. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocate storage for `count` objects of `type` through uprv_malloc.
 * The whole expansion is parenthesized so it can be used inside larger
 * expressions; callers must check the result against NULL. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
/* Release storage obtained from NEW_ARRAY (or uprv_malloc) through uprv_free. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t fCSRecognizers_size = 0;
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UBool U_CALLCONV csdet_cleanup(void)
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (fCSRecognizers != NULL) {
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            delete fCSRecognizers[r];
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fCSRecognizers[r] = NULL;
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        DELETE_ARRAY(fCSRecognizers);
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fCSRecognizers = NULL;
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fCSRecognizers_size = 0;
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return TRUE;
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerucharsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_NAMESPACE_USE
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const CharsetMatch **csm_l = (const CharsetMatch **) left;
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const CharsetMatch **csm_r = (const CharsetMatch **) right;
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // NOTE: compare is backwards to sort from highest to lowest.
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setRecognizers(UErrorCode &status)
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool needsInit;
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    CharsetRecognizer **recognizers;
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (needsInit) {
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        CharsetRecognizer *tempArray[] = {
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF8(),
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF_16_BE(),
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF_16_LE(),
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF_32_BE(),
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF_32_LE(),
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_en(),
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_da(),
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_de(),
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_es(),
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_fr(),
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_it(),
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_nl(),
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_no(),
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_pt(),
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_1_sv(),
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_2_cs(),
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_2_hu(),
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_2_pl(),
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_2_ro(),
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_5_ru(),
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_6_ar(),
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_7_el(),
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_8_I_he(),
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_8_he(),
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_windows_1251(),
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_windows_1256(),
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_KOI8_R(),
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_9_tr(),
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_sjis(),
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_gb_18030(),
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_euc_jp(),
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_euc_kr(),
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_big5(),
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_2022JP(),
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_2022KR(),
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_2022CN()
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        };
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t rCount = ARRAY_SIZE(tempArray);
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t r;
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (recognizers == NULL) {
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            status = U_MEMORY_ALLOCATION_ERROR;
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for (r = 0; r < rCount; r += 1) {
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                recognizers[r] = tempArray[r];
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (recognizers[r] == NULL) {
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    status = U_MEMORY_ALLOCATION_ERROR;
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (U_SUCCESS(status)) {
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            umtx_lock(NULL);
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (fCSRecognizers == NULL) {
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fCSRecognizers = recognizers;
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fCSRecognizers_size = rCount;
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            umtx_unlock(NULL);
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (fCSRecognizers != recognizers) {
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for (r = 0; r < rCount; r += 1) {
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                delete recognizers[r];
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                recognizers[r] = NULL;
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            DELETE_ARRAY(recognizers);
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        recognizers = NULL;
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::CharsetDetector(UErrorCode &status)
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  : textIn(new InputText()), resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    setRecognizers(status);
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (resultArray == NULL) {
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        status = U_MEMORY_ALLOCATION_ERROR;
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        resultArray[i] = new CharsetMatch();
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (resultArray[i] == NULL) {
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            status = U_MEMORY_ALLOCATION_ERROR;
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::~CharsetDetector()
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete textIn;
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete resultArray[i];
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_free(resultArray);
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setText(const char *in, int32_t len)
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    textIn->setText(in, len);
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fFreshTextSet = TRUE;
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::setStripTagsFlag(UBool flag)
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool temp = fStripTags;
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fStripTags = flag;
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fFreshTextSet = TRUE;
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return temp;
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::getStripTagsFlag() const
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fStripTags;
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    textIn->setDeclaredEncoding(encoding,len);
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t CharsetDetector::getDetectableCount()
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    setRecognizers(status);
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fCSRecognizers_size;
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch *CharsetDetector::detect(UErrorCode &status)
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t maxMatchesFound = 0;
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    detectAll(maxMatchesFound, status);
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(maxMatchesFound > 0) {
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return resultArray[0];
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(!textIn->isSet()) {
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else if(fFreshTextSet) {
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        CharsetRecognizer *csr;
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t            detectResults;
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t            confidence;
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t            i;
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        textIn->MungeInput(fStripTags);
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Iterate over all possible charsets, remember all that
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // give a match quality > 0.
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        resultCount = 0;
270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for (i = 0; i < fCSRecognizers_size; i += 1) {
271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            csr = fCSRecognizers[i];
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            detectResults = csr->match(textIn);
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            confidence = detectResults;
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (confidence > 0)  {
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                resultArray[resultCount++]->set(textIn, csr, confidence);
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i = resultCount; i < fCSRecognizers_size; i += 1) {
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            resultArray[i]->set(textIn, 0, 0);
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ////Bubble sort
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //for(int32_t i = resultCount; i > 1; i -= 1) {
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //    for(int32_t j = 0; j < i-1; j += 1) {
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //        if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //            CharsetMatch *temp = resultArray[j];
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //            resultArray[j] = resultArray[j+1];
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //            resultArray[j+1] = temp;
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //        }
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //    }
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //}
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fFreshTextSet = FALSE;
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    maxMatchesFound = resultCount;
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return resultArray;
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if( index > fCSRecognizers_size-1 || index < 0) {
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        status = U_INDEX_OUTOFBOUNDS_ERROR;
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return fCSRecognizers[index]->getName();
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}*/
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querutypedef struct {
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t currIndex;
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} Context;
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumClose(UEnumeration *en) {
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(en->context != NULL) {
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        DELETE_ARRAY(en->context);
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    DELETE_ARRAY(en);
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumCount(UEnumeration *, UErrorCode *) {
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fCSRecognizers_size;
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const char* U_CALLCONV
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(resultLength != NULL) {
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            *resultLength = 0;
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(resultLength != NULL) {
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        *resultLength = (int32_t)uprv_strlen(currName);
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ((Context *)en->context)->currIndex++;
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return currName;
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumReset(UEnumeration *en, UErrorCode *) {
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ((Context *)en->context)->currIndex = 0;
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const UEnumeration gCSDetEnumeration = {
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    NULL,
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    NULL,
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    enumClose,
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    enumCount,
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uenum_unextDefault,
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    enumNext,
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    enumReset
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI  UEnumeration * U_EXPORT2
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status)
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_NAMESPACE_USE
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* Initialize recognized charsets. */
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    CharsetDetector::getDetectableCount();
381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    en->context = (void*)NEW_ARRAY(Context, 1);
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memset(en->context, 0, sizeof(Context));
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return en;
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
392