1c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/*
2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott **********************************************************************
3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott *   Copyright (C) 2005-2009, International Business Machines
4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott *   Corporation and others.  All Rights Reserved.
5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott **********************************************************************
6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott */
7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/utypes.h"
9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if !UCONFIG_NO_CONVERSION
11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/ucsdet.h"
13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csdetect.h"
15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csmatch.h"
16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "uenumimp.h"
17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "cmemory.h"
19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "cstring.h"
20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "umutex.h"
21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "ucln_in.h"
22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "uarrsort.h"
23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "inputext.h"
24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csrsbcs.h"
25c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csrmbcs.h"
26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csrutf8.h"
27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csrucode.h"
28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csr2022.h"
29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Number of elements in a true array (not a pointer). The argument is fully
// parenthesized so that any array-valued expression can be passed safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocates raw storage for `count` objects of `type` with uprv_malloc().
// No constructors run. The whole expansion is parenthesized so the cast
// cannot mis-bind with a following postfix operator.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
// Releases storage with uprv_free(); no destructors run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CDECL_BEGIN
36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;
37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic int32_t fCSRecognizers_size = 0;
39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic UBool U_CALLCONV csdet_cleanup(void)
41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (fCSRecognizers != NULL) {
43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            delete fCSRecognizers[r];
45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            fCSRecognizers[r] = NULL;
46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        DELETE_ARRAY(fCSRecognizers);
49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        fCSRecognizers = NULL;
50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        fCSRecognizers_size = 0;
51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return TRUE;
54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic int32_t U_CALLCONV
57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottcharsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    U_NAMESPACE_USE
60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    const CharsetMatch **csm_l = (const CharsetMatch **) left;
62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    const CharsetMatch **csm_r = (const CharsetMatch **) right;
63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // NOTE: compare is backwards to sort from highest to lowest.
65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CDECL_END
69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_BEGIN
71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid CharsetDetector::setRecognizers(UErrorCode &status)
73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    UBool needsInit;
75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    CharsetRecognizer **recognizers;
76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (U_FAILURE(status)) {
78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return;
79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (needsInit) {
84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        CharsetRecognizer *tempArray[] = {
85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_UTF8(),
86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_UTF_16_BE(),
88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_UTF_16_LE(),
89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_UTF_32_BE(),
90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_UTF_32_LE(),
91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_en(),
93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_da(),
94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_de(),
95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_es(),
96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_fr(),
97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_it(),
98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_nl(),
99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_no(),
100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_pt(),
101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_1_sv(),
102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_2_cs(),
103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_2_hu(),
104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_2_pl(),
105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_2_ro(),
106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_5_ru(),
107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_6_ar(),
108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_7_el(),
109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_8_I_he(),
110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_8_he(),
111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_windows_1251(),
112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_windows_1256(),
113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_KOI8_R(),
114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_8859_9_tr(),
115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_sjis(),
116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_gb_18030(),
117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_euc_jp(),
118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_euc_kr(),
119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_big5(),
120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_2022JP(),
122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_2022KR(),
123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_2022CN(),
124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_IBM424_he_rtl(),
126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_IBM424_he_ltr(),
127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_IBM420_ar_rtl(),
128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            new CharsetRecog_IBM420_ar_ltr()
129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        };
130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        int32_t rCount = ARRAY_SIZE(tempArray);
131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        int32_t r;
132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        if (recognizers == NULL) {
136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            status = U_MEMORY_ALLOCATION_ERROR;
137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            return;
138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        } else {
139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            for (r = 0; r < rCount; r += 1) {
140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                recognizers[r] = tempArray[r];
141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                if (recognizers[r] == NULL) {
143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    status = U_MEMORY_ALLOCATION_ERROR;
144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    break;
145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                }
146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            }
147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        if (U_SUCCESS(status)) {
150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            umtx_lock(NULL);
151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            if (fCSRecognizers == NULL) {
152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                fCSRecognizers_size = rCount;
153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                fCSRecognizers = recognizers;
154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            }
155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            umtx_unlock(NULL);
156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        if (fCSRecognizers != recognizers) {
159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            for (r = 0; r < rCount; r += 1) {
160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                delete recognizers[r];
161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                recognizers[r] = NULL;
162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            }
163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            DELETE_ARRAY(recognizers);
165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        recognizers = NULL;
168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottCharsetDetector::CharsetDetector(UErrorCode &status)
173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  : textIn(new InputText(status)), resultArray(NULL),
174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (U_FAILURE(status)) {
177c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return;
178c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
179c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
180c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    setRecognizers(status);
181c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
182c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (U_FAILURE(status)) {
183c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return;
184c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
185c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
186c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
187c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
188c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (resultArray == NULL) {
189c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        status = U_MEMORY_ALLOCATION_ERROR;
190c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return;
191c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
192c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
193c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
194c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        resultArray[i] = new CharsetMatch();
195c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
196c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        if (resultArray[i] == NULL) {
197c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            status = U_MEMORY_ALLOCATION_ERROR;
198c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            break;
199c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
200c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
201c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
202c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
203c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottCharsetDetector::~CharsetDetector()
204c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
205c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    delete textIn;
206c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
207c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
208c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        delete resultArray[i];
209c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
210c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
211c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uprv_free(resultArray);
212c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
213c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
214c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid CharsetDetector::setText(const char *in, int32_t len)
215c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
216c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    textIn->setText(in, len);
217c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    fFreshTextSet = TRUE;
218c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
219c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
220c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool CharsetDetector::setStripTagsFlag(UBool flag)
221c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
222c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    UBool temp = fStripTags;
223c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    fStripTags = flag;
224c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    fFreshTextSet = TRUE;
225c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return temp;
226c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
227c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
228c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool CharsetDetector::getStripTagsFlag() const
229c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
230c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return fStripTags;
231c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
232c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
233c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
234c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
235c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    textIn->setDeclaredEncoding(encoding,len);
236c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
237c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
238c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t CharsetDetector::getDetectableCount()
239c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
240c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    UErrorCode status = U_ZERO_ERROR;
241c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
242c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    setRecognizers(status);
243c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
244c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return fCSRecognizers_size;
245c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
246c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
247c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst CharsetMatch *CharsetDetector::detect(UErrorCode &status)
248c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
249c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    int32_t maxMatchesFound = 0;
250c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
251c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    detectAll(maxMatchesFound, status);
252c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
253c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if(maxMatchesFound > 0) {
254c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return resultArray[0];
255c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    } else {
256c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return NULL;
257c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
258c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
259c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
260c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
261c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
262c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if(!textIn->isSet()) {
263c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
264c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
265c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return NULL;
266c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    } else if(fFreshTextSet) {
267c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        CharsetRecognizer *csr;
268c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        int32_t            detectResults;
269c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        int32_t            confidence;
270c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        int32_t            i;
271c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
272c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        textIn->MungeInput(fStripTags);
273c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
274c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // Iterate over all possible charsets, remember all that
275c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // give a match quality > 0.
276c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        resultCount = 0;
277c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        for (i = 0; i < fCSRecognizers_size; i += 1) {
278c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            csr = fCSRecognizers[i];
279c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            detectResults = csr->match(textIn);
280c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            confidence = detectResults;
281c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
282c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            if (confidence > 0)  {
283c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                resultArray[resultCount++]->set(textIn, csr, confidence);
284c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            }
285c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
286c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
287c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        for(i = resultCount; i < fCSRecognizers_size; i += 1) {
288c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            resultArray[i]->set(textIn, 0, 0);
289c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
290c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
291c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
292c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
293c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // Remove duplicate charsets from the results.
294c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // Simple minded, brute force approach - check each entry against all that follow.
295c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // The first entry of any duplicated set is the one that should be kept because it will
296c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // be the one with the highest confidence rating.
297c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        //   (Duplicate matches have different languages, only the charset is the same)
298c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
299c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // deleted, just reordered, with the unwanted duplicates placed after the good results.
300c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        int32_t j, k;
301c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        for (i=0; i<resultCount; i++) {
302c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            const char *charSetName = resultArray[i]->getName();
303c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            for (j=i+1; j<resultCount; ) {
304c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
305c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    // Not a duplicate.
306c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    j++;
307c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                } else {
308c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    // Duplicate entry at index j.
309c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    CharsetMatch *duplicate = resultArray[j];
310c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    for (k=j; k<resultCount-1; k++) {
311c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                        resultArray[k] = resultArray[k+1];
312c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    }
313c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    resultCount--;
314c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    resultArray[resultCount] = duplicate;
315c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                }
316c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            }
317c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
318c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
319c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        fFreshTextSet = FALSE;
320c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
321c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
322c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    maxMatchesFound = resultCount;
323c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
324c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return resultArray;
325c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
326c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
327c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
328c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
329c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if( index > fCSRecognizers_size-1 || index < 0) {
330c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        status = U_INDEX_OUTOFBOUNDS_ERROR;
331c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
332c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return 0;
333c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    } else {
334c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return fCSRecognizers[index]->getName();
335c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
336c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}*/
337c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
338c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_END
339c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
340c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CDECL_BEGIN
341c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttypedef struct {
342c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    int32_t currIndex;
343c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} Context;
344c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
345c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
346c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
347c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic void U_CALLCONV
348c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottenumClose(UEnumeration *en) {
349c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if(en->context != NULL) {
350c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        DELETE_ARRAY(en->context);
351c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
352c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
353c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    DELETE_ARRAY(en);
354c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
355c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
356c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic int32_t U_CALLCONV
357c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottenumCount(UEnumeration *, UErrorCode *) {
358c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return fCSRecognizers_size;
359c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
360c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
361c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic const char* U_CALLCONV
362c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottenumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
363c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
364c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        if(resultLength != NULL) {
365c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            *resultLength = 0;
366c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
367c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return NULL;
368c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
369c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
370c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if(resultLength != NULL) {
371c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        *resultLength = (int32_t)uprv_strlen(currName);
372c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
373c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    ((Context *)en->context)->currIndex++;
374c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
375c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return currName;
376c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
377c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
378c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic void U_CALLCONV
379c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottenumReset(UEnumeration *en, UErrorCode *) {
380c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    ((Context *)en->context)->currIndex = 0;
381c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
382c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
383c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic const UEnumeration gCSDetEnumeration = {
384c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    NULL,
385c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    NULL,
386c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    enumClose,
387c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    enumCount,
388c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uenum_unextDefault,
389c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    enumNext,
390c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    enumReset
391c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott};
392c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
393c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CAPI  UEnumeration * U_EXPORT2
394c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
395c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
396c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    U_NAMESPACE_USE
397c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
398c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if(U_FAILURE(*status)) {
399c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        return 0;
400c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
401c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
402c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    /* Initialize recognized charsets. */
403c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    CharsetDetector::getDetectableCount();
404c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
405c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
406c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
407c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    en->context = (void*)NEW_ARRAY(Context, 1);
408c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uprv_memset(en->context, 0, sizeof(Context));
409c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return en;
410c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
411c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CDECL_END
412c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
413c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif
414c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
415