/*
 ********************************************************************************
 *   Copyright (C) 2005-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 ********************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION
#include "unicode/ucsdet.h"
#include "csdetect.h"
#include "csmatch.h"
#include "csrsbcs.h"
#include "csrmbcs.h"
#include "csrutf8.h"
#include "csrucode.h"
#include "csr2022.h"

#include "cmemory.h"

U_NAMESPACE_USE

#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define DELETE_ARRAY(array) uprv_free((void *) (array)) 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UCharsetDetector * U_EXPORT2 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_open(UErrorCode *status) 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector* csd = new CharsetDetector(*status); 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(*status)) { 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete csd; 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru csd = NULL; 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UCharsetDetector *) csd; 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI void U_EXPORT2 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_close(UCharsetDetector *ucsd) 
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector *csd = (CharsetDetector *) ucsd; 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete csd; 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI void U_EXPORT2 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status) 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((CharsetDetector *) ucsd)->setText(textIn, len); 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const char * U_EXPORT2 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status) 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetMatch *) ucsm)->getName(); 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status) 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetMatch *) ucsm)->getConfidence(); 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const char * U_EXPORT2 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status) 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetMatch *) ucsm)->getLanguage(); 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const UCharsetMatch * U_EXPORT2 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status) 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (const UCharsetMatch *) ((CharsetDetector *) ucsd)->detect(*status); 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI void U_EXPORT2 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status) 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((CharsetDetector *) ucsd)->setDeclaredEncoding(encoding,length); 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const UCharsetMatch** 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_detectAll(UCharsetDetector *ucsd, 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t *maxMatchesFound, UErrorCode *status) 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector *csd = (CharsetDetector *) ucsd; 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (const UCharsetMatch**)csd->detectAll(*maxMatchesFound,*status); 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// U_CAPI const char * U_EXPORT2 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// ucsdet_getDetectableCharsetName(const UCharsetDetector *csd, int32_t index, UErrorCode *status) 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// { 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// if(U_FAILURE(*status)) { 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// return 0; 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// } 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// return csd->getCharsetName(index,*status); 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// } 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// U_CAPI int32_t U_EXPORT2 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// ucsdet_getDetectableCharsetsCount(const UCharsetDetector *csd, UErrorCode *status) 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// { 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// if(U_FAILURE(*status)) { 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// return -1; 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// } 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// return UCharsetDetector::getDetectableCount(); 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// } 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UBool U_EXPORT2 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd) 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // todo: could use an error return... 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ucsd == NULL) { 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetDetector *) ucsd)->getStripTagsFlag(); 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UBool U_EXPORT2 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter) 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // todo: could use an error return... 
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ucsd == NULL) { 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector *csd = (CharsetDetector *) ucsd; 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool prev = csd->getStripTagsFlag(); 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru csd->setStripTagsFlag(filter); 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return prev; 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getUChars(const UCharsetMatch *ucsm, 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar *buf, int32_t cap, UErrorCode *status) 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetMatch *) ucsm)->getUChars(buf, cap, status); 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 18359d709d503bab6e2b61931737e662dd293b40578ccornelius 18459d709d503bab6e2b61931737e662dd293b40578ccorneliusU_CAPI void U_EXPORT2 18559d709d503bab6e2b61931737e662dd293b40578ccorneliusucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, 
UBool enabled, UErrorCode *status) 18659d709d503bab6e2b61931737e662dd293b40578ccornelius{ 18759d709d503bab6e2b61931737e662dd293b40578ccornelius ((CharsetDetector *)ucsd)->setDetectableCharset(encoding, enabled, *status); 18859d709d503bab6e2b61931737e662dd293b40578ccornelius} 18959d709d503bab6e2b61931737e662dd293b40578ccornelius 19059d709d503bab6e2b61931737e662dd293b40578ccorneliusU_CAPI UEnumeration * U_EXPORT2 19159d709d503bab6e2b61931737e662dd293b40578ccorneliusucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) 19259d709d503bab6e2b61931737e662dd293b40578ccornelius{ 19359d709d503bab6e2b61931737e662dd293b40578ccornelius return CharsetDetector::getAllDetectableCharsets(*status); 19459d709d503bab6e2b61931737e662dd293b40578ccornelius} 19559d709d503bab6e2b61931737e662dd293b40578ccornelius 19659d709d503bab6e2b61931737e662dd293b40578ccorneliusU_DRAFT UEnumeration * U_EXPORT2 19759d709d503bab6e2b61931737e662dd293b40578ccorneliusucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status) 19859d709d503bab6e2b61931737e662dd293b40578ccornelius{ 19959d709d503bab6e2b61931737e662dd293b40578ccornelius return ((CharsetDetector *)ucsd)->getDetectableCharsets(*status); 20059d709d503bab6e2b61931737e662dd293b40578ccornelius} 20159d709d503bab6e2b61931737e662dd293b40578ccornelius 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 20459d709d503bab6e2b61931737e662dd293b40578ccornelius 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 206