/*
 ********************************************************************************
 * Copyright (C) 2005-2007, International Business Machines
 * Corporation and others. All Rights Reserved.
 ********************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION
#include "unicode/ucsdet.h"
#include "csdetect.h"
#include "csmatch.h"

#include "cmemory.h"

U_NAMESPACE_USE

#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UCharsetDetector * U_EXPORT2 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_open(UErrorCode *status) 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector* csd = new CharsetDetector(*status); 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(*status)) { 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete csd; 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru csd = NULL; 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UCharsetDetector *) csd; 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI void U_EXPORT2 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_close(UCharsetDetector *ucsd) 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector *csd = (CharsetDetector *) ucsd; 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete csd; 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI void U_EXPORT2 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status) 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((CharsetDetector *) ucsd)->setText(textIn, len); 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const char * U_EXPORT2 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status) 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetMatch *) ucsm)->getName(); 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status) 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
if(U_FAILURE(*status)) { 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetMatch *) ucsm)->getConfidence(); 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const char * U_EXPORT2 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status) 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetMatch *) ucsm)->getLanguage(); 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const UCharsetMatch * U_EXPORT2 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status) 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (const UCharsetMatch *) ((CharsetDetector *) ucsd)->detect(*status); 
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI void U_EXPORT2 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status) 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((CharsetDetector *) ucsd)->setDeclaredEncoding(encoding,length); 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI const UCharsetMatch** 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_detectAll(UCharsetDetector *ucsd, 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t *maxMatchesFound, UErrorCode *status) 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector *csd = (CharsetDetector *) ucsd; 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (const UCharsetMatch**)csd->detectAll(*maxMatchesFound,*status); 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// U_CAPI const char * U_EXPORT2 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// ucsdet_getDetectableCharsetName(const UCharsetDetector *csd, int32_t index, UErrorCode *status) 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// { 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// if(U_FAILURE(*status)) { 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// return 0; 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// } 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// return csd->getCharsetName(index,*status); 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// } 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// U_CAPI int32_t U_EXPORT2 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// ucsdet_getDetectableCharsetsCount(const UCharsetDetector *csd, UErrorCode *status) 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// { 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// if(U_FAILURE(*status)) { 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// return -1; 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// } 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// return UCharsetDetector::getDetectableCount(); 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// } 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UBool U_EXPORT2 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd) 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // todo: could use an error return... 
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ucsd == NULL) { 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetDetector *) ucsd)->getStripTagsFlag(); 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UBool U_EXPORT2 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter) 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // todo: could use an error return... 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ucsd == NULL) { 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector *csd = (CharsetDetector *) ucsd; 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool prev = csd->getStripTagsFlag(); 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru csd->setStripTagsFlag(filter); 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return prev; 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getUChars(const UCharsetMatch *ucsm, 
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar *buf, int32_t cap, UErrorCode *status) 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return ((CharsetMatch *) ucsm)->getUChars(buf, cap, status); 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 181