1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru **********************************************************************
354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius *   Copyright (C) 2005-2012, International Business Machines
4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   Corporation and others.  All Rights Reserved.
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru **********************************************************************
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ucsdet.h"
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csdetect.h"
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csmatch.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uenumimp.h"
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h"
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "umutex.h"
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "ucln_in.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uarrsort.h"
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "inputext.h"
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrsbcs.h"
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrmbcs.h"
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrutf8.h"
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrucode.h"
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csr2022.h"
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
/* Number of elements in a true array (not a pointer). The argument is
 * parenthesized so that any expression naming an array expands correctly. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocates uninitialized storage for `count` objects of `type` with
 * uprv_malloc; yields NULL on failure. The whole expansion is parenthesized
 * so it can be used safely inside larger expressions. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
/* Releases storage obtained from NEW_ARRAY. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN
36103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliusstatic icu::CharsetRecognizer **fCSRecognizers = NULL;
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t fCSRecognizers_size = 0;
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UBool U_CALLCONV csdet_cleanup(void)
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (fCSRecognizers != NULL) {
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            delete fCSRecognizers[r];
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fCSRecognizers[r] = NULL;
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        DELETE_ARRAY(fCSRecognizers);
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fCSRecognizers = NULL;
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fCSRecognizers_size = 0;
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return TRUE;
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerucharsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_NAMESPACE_USE
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const CharsetMatch **csm_l = (const CharsetMatch **) left;
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const CharsetMatch **csm_r = (const CharsetMatch **) right;
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // NOTE: compare is backwards to sort from highest to lowest.
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setRecognizers(UErrorCode &status)
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool needsInit;
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    CharsetRecognizer **recognizers;
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (needsInit) {
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        CharsetRecognizer *tempArray[] = {
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF8(),
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF_16_BE(),
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF_16_LE(),
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF_32_BE(),
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_UTF_32_LE(),
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
9254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            new CharsetRecog_8859_1(),
9354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            new CharsetRecog_8859_2(),
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_5_ru(),
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_6_ar(),
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_7_el(),
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_8_I_he(),
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_8_he(),
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_windows_1251(),
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_windows_1256(),
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_KOI8_R(),
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_8859_9_tr(),
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_sjis(),
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_gb_18030(),
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_euc_jp(),
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_euc_kr(),
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_big5(),
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_2022JP(),
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            new CharsetRecog_2022KR(),
11185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            new CharsetRecog_2022CN(),
11285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
11385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            new CharsetRecog_IBM424_he_rtl(),
11485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            new CharsetRecog_IBM424_he_ltr(),
11585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            new CharsetRecog_IBM420_ar_rtl(),
11685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            new CharsetRecog_IBM420_ar_ltr()
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        };
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t rCount = ARRAY_SIZE(tempArray);
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t r;
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (recognizers == NULL) {
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            status = U_MEMORY_ALLOCATION_ERROR;
12585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            return;
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for (r = 0; r < rCount; r += 1) {
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                recognizers[r] = tempArray[r];
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (recognizers[r] == NULL) {
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    status = U_MEMORY_ALLOCATION_ERROR;
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (U_SUCCESS(status)) {
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            umtx_lock(NULL);
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (fCSRecognizers == NULL) {
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fCSRecognizers_size = rCount;
14185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                fCSRecognizers = recognizers;
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            umtx_unlock(NULL);
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (fCSRecognizers != recognizers) {
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for (r = 0; r < rCount; r += 1) {
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                delete recognizers[r];
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                recognizers[r] = NULL;
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            DELETE_ARRAY(recognizers);
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        recognizers = NULL;
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::CharsetDetector(UErrorCode &status)
16185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho  : textIn(new InputText(status)), resultArray(NULL),
16285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    setRecognizers(status);
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (resultArray == NULL) {
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        status = U_MEMORY_ALLOCATION_ERROR;
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        resultArray[i] = new CharsetMatch();
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (resultArray[i] == NULL) {
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            status = U_MEMORY_ALLOCATION_ERROR;
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::~CharsetDetector()
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete textIn;
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete resultArray[i];
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_free(resultArray);
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setText(const char *in, int32_t len)
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    textIn->setText(in, len);
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fFreshTextSet = TRUE;
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::setStripTagsFlag(UBool flag)
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool temp = fStripTags;
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fStripTags = flag;
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fFreshTextSet = TRUE;
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return temp;
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::getStripTagsFlag() const
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fStripTags;
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    textIn->setDeclaredEncoding(encoding,len);
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t CharsetDetector::getDetectableCount()
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    setRecognizers(status);
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fCSRecognizers_size;
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch *CharsetDetector::detect(UErrorCode &status)
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t maxMatchesFound = 0;
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    detectAll(maxMatchesFound, status);
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(maxMatchesFound > 0) {
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return resultArray[0];
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(!textIn->isSet()) {
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
25454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    } else if (fFreshTextSet) {
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        CharsetRecognizer *csr;
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t            i;
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        textIn->MungeInput(fStripTags);
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Iterate over all possible charsets, remember all that
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // give a match quality > 0.
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        resultCount = 0;
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for (i = 0; i < fCSRecognizers_size; i += 1) {
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            csr = fCSRecognizers[i];
26554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            if (csr->match(textIn, resultArray[resultCount])) {
26654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                resultCount++;
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
27054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if (resultCount > 1) {
27154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
27285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        }
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fFreshTextSet = FALSE;
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    maxMatchesFound = resultCount;
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return resultArray;
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if( index > fCSRecognizers_size-1 || index < 0) {
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        status = U_INDEX_OUTOFBOUNDS_ERROR;
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return fCSRecognizers[index]->getName();
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}*/
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querutypedef struct {
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t currIndex;
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} Context;
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumClose(UEnumeration *en) {
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(en->context != NULL) {
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        DELETE_ARRAY(en->context);
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    DELETE_ARRAY(en);
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumCount(UEnumeration *, UErrorCode *) {
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fCSRecognizers_size;
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const char* U_CALLCONV
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(resultLength != NULL) {
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            *resultLength = 0;
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(resultLength != NULL) {
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        *resultLength = (int32_t)uprv_strlen(currName);
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ((Context *)en->context)->currIndex++;
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return currName;
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumReset(UEnumeration *en, UErrorCode *) {
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ((Context *)en->context)->currIndex = 0;
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const UEnumeration gCSDetEnumeration = {
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    NULL,
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    NULL,
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    enumClose,
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    enumCount,
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uenum_unextDefault,
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    enumNext,
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    enumReset
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI  UEnumeration * U_EXPORT2
34885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Houcsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_NAMESPACE_USE
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* Initialize recognized charsets. */
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    CharsetDetector::getDetectableCount();
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    en->context = (void*)NEW_ARRAY(Context, 1);
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memset(en->context, 0, sizeof(Context));
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return en;
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
369