/*
 ********************************************************************************
 *   Copyright (C) 2005-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 ********************************************************************************
 */
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ucsdet.h"
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csdetect.h"
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csmatch.h"
1459d709d503bab6e2b61931737e662dd293b40578ccornelius#include "csrsbcs.h"
1559d709d503bab6e2b61931737e662dd293b40578ccornelius#include "csrmbcs.h"
1659d709d503bab6e2b61931737e662dd293b40578ccornelius#include "csrutf8.h"
1759d709d503bab6e2b61931737e662dd293b40578ccornelius#include "csrucode.h"
1859d709d503bab6e2b61931737e662dd293b40578ccornelius#include "csr2022.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_USE
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
/* Number of elements in a true array (not a pointer). Argument and result are
 * parenthesized so the macro composes safely inside larger expressions. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocate room for `count` objects of `type` via uprv_malloc; the whole
 * expansion is parenthesized so postfix operators apply to the cast result. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
/* Release storage through uprv_free. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UCharsetDetector * U_EXPORT2
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_open(UErrorCode   *status)
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    CharsetDetector* csd = new CharsetDetector(*status);
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(*status)) {
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete csd;
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        csd = NULL;
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return (UCharsetDetector *) csd;
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI void U_EXPORT2
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_close(UCharsetDetector *ucsd)
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    CharsetDetector *csd = (CharsetDetector *) ucsd;
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete csd;
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI void U_EXPORT2
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status)
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ((CharsetDetector *) ucsd)->setText(textIn, len);
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const char * U_EXPORT2
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status)
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return ((CharsetMatch *) ucsm)->getName();
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status)
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return ((CharsetMatch *) ucsm)->getConfidence();
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const char * U_EXPORT2
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status)
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return ((CharsetMatch *) ucsm)->getLanguage();
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const UCharsetMatch * U_EXPORT2
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status)
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return (const UCharsetMatch *) ((CharsetDetector *) ucsd)->detect(*status);
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI void U_EXPORT2
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status)
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ((CharsetDetector *) ucsd)->setDeclaredEncoding(encoding,length);
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const UCharsetMatch**
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_detectAll(UCharsetDetector *ucsd,
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                 int32_t *maxMatchesFound, UErrorCode *status)
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    CharsetDetector *csd = (CharsetDetector *) ucsd;
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return (const UCharsetMatch**)csd->detectAll(*maxMatchesFound,*status);
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
// U_CAPI  const char * U_EXPORT2
// ucsdet_getDetectableCharsetName(const UCharsetDetector *csd, int32_t index, UErrorCode *status)
// {
//     if(U_FAILURE(*status)) {
//         return 0;
//     }
//     return csd->getCharsetName(index,*status);
// }

// U_CAPI  int32_t U_EXPORT2
// ucsdet_getDetectableCharsetsCount(const UCharsetDetector *csd, UErrorCode *status)
// {
//     if(U_FAILURE(*status)) {
//         return -1;
//     }
//     return UCharsetDetector::getDetectableCount();
// }
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI  UBool U_EXPORT2
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd)
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // todo: could use an error return...
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (ucsd == NULL) {
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return FALSE;
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return ((CharsetDetector *) ucsd)->getStripTagsFlag();
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI  UBool U_EXPORT2
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter)
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // todo: could use an error return...
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (ucsd == NULL) {
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return FALSE;
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    CharsetDetector *csd = (CharsetDetector *) ucsd;
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool prev = csd->getStripTagsFlag();
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    csd->setStripTagsFlag(filter);
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return prev;
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI  int32_t U_EXPORT2
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getUChars(const UCharsetMatch *ucsm,
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                 UChar *buf, int32_t cap, UErrorCode *status)
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(*status)) {
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return ((CharsetMatch *) ucsm)->getUChars(buf, cap, status);
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
18359d709d503bab6e2b61931737e662dd293b40578ccornelius
18459d709d503bab6e2b61931737e662dd293b40578ccorneliusU_CAPI void U_EXPORT2
18559d709d503bab6e2b61931737e662dd293b40578ccorneliusucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status)
18659d709d503bab6e2b61931737e662dd293b40578ccornelius{
18759d709d503bab6e2b61931737e662dd293b40578ccornelius    ((CharsetDetector *)ucsd)->setDetectableCharset(encoding, enabled, *status);
18859d709d503bab6e2b61931737e662dd293b40578ccornelius}
18959d709d503bab6e2b61931737e662dd293b40578ccornelius
19059d709d503bab6e2b61931737e662dd293b40578ccorneliusU_CAPI  UEnumeration * U_EXPORT2
19159d709d503bab6e2b61931737e662dd293b40578ccorneliusucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
19259d709d503bab6e2b61931737e662dd293b40578ccornelius{
19359d709d503bab6e2b61931737e662dd293b40578ccornelius    return CharsetDetector::getAllDetectableCharsets(*status);
19459d709d503bab6e2b61931737e662dd293b40578ccornelius}
19559d709d503bab6e2b61931737e662dd293b40578ccornelius
19659d709d503bab6e2b61931737e662dd293b40578ccorneliusU_DRAFT UEnumeration * U_EXPORT2
19759d709d503bab6e2b61931737e662dd293b40578ccorneliusucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status)
19859d709d503bab6e2b61931737e662dd293b40578ccornelius{
19959d709d503bab6e2b61931737e662dd293b40578ccornelius    return ((CharsetDetector *)ucsd)->getDetectableCharsets(*status);
20059d709d503bab6e2b61931737e662dd293b40578ccornelius}
20159d709d503bab6e2b61931737e662dd293b40578ccornelius
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
20459d709d503bab6e2b61931737e662dd293b40578ccornelius
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
206