1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ********************************************************************** 359d709d503bab6e2b61931737e662dd293b40578ccornelius * Copyright (C) 2005-2013, International Business Machines 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Corporation and others. All Rights Reserved. 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ********************************************************************** 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * file name: ucsdet.h 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * encoding: US-ASCII 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * indentation:4 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * created on: 2005Aug04 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * created by: Andy Heninger 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * ICU Character Set Detection, API for C 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Draft version 18 Oct 2005 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifndef __UCSDET_H 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define __UCSDET_H 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION 
2550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/localpointer.h" 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uenum.h" 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * \file 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * \brief C API: Charset Detection API 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * This API provides a facility for detecting the 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * charset or encoding of character data in an unknown text format. 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The input data can be from an array of bytes. 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <p> 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Character set detection is at best an imprecise operation. The detection 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * process will attempt to identify the charset that best matches the characteristics 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * of the byte data, but the process is partly statistical in nature, and 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the results can not be guaranteed to always be correct. 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <p> 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * For best accuracy in charset detection, the input data should be primarily 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * in a single language, and a minimum of a few hundred bytes worth of plain text 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * in the language are needed. 
The detection process will attempt to 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * ignore html or xml style markup that could otherwise obscure the content. 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct UCharsetDetector; 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Structure representing a charset detector 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querutypedef struct UCharsetDetector UCharsetDetector; 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct UCharsetMatch; 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Opaque structure representing a match that was identified 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * from a charset detection operation. 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querutypedef struct UCharsetMatch UCharsetMatch; 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Open a charset detector. 
66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status Any error conditions occurring during the open 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * operation are reported back in this variable. 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return the newly opened charset detector. 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE UCharsetDetector * U_EXPORT2 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_open(UErrorCode *status); 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Close a charset detector. All storage and any other resources 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * owned by this charset detector will be released. Failure to 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * close a charset detector when finished with it can result in 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * memory leaks in the application. 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsd The charset detector to be closed. 
82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE void U_EXPORT2 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_close(UCharsetDetector *ucsd); 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if U_SHOW_CPLUSPLUS_API 8850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 8950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_BEGIN 9050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 9150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/** 9250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * \class LocalUCharsetDetectorPointer 9350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). 9450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * For most methods see the LocalPointerBase base class. 9550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 9650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @see LocalPointerBase 9750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @see LocalPointer 9827f654740f2a26ad62a5c155af9199af9e69b889claireho * @stable ICU 4.4 9950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 10050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); 10150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 10250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_END 10350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 10450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 10550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Set the input byte data whose charset is to be detected. 
108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Ownership of the input text byte array remains with the caller. 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The input string must not be altered or deleted until the charset 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * detector is either closed or reset to refer to different input text. 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsd the charset detector to be used. 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param textIn the input text of unknown encoding. 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param len the length of the input text, or -1 if the text 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * is NUL terminated. 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status any error conditions are reported back in this variable. 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE void U_EXPORT2 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** Set the declared encoding for charset detection. 
126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The declared encoding of an input text is an encoding obtained 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * by the user from an http header or xml declaration or similar source that 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * can be provided as an additional hint to the charset detector. 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * How and whether the declared encoding will be used during the 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * detection process is TBD. 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsd the charset detector to be used. 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param encoding an encoding for the current data obtained from 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * a header or declaration or other source outside 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * of the byte data itself. 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param length the length of the encoding name, or -1 if the name string 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * is NUL terminated. 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status any error conditions are reported back in this variable. 
140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE void U_EXPORT2 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Return the charset that best matches the supplied input data. 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Note though, that because the detection 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * only looks at the start of the input data, 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * there is a possibility that the returned charset will fail to handle 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the full set of input data. 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <p> 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The returned UCharsetMatch object is owned by the UCharsetDetector. 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * It will remain valid until the detector input is reset, or until 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the detector is closed. 
158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <p> 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The function will fail if 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <ul> 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <li>no charset appears to match the data.</li> 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <li>no input text has been provided</li> 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * </ul> 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsd the charset detector to be used. 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status any error conditions are reported back in this variable. 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return a UCharsetMatch representing the best matching charset, 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * or NULL if no charset matches the byte data. 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE const UCharsetMatch * U_EXPORT2 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Find all charset matches that appear to be consistent with the input, 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * returning an array of results. 
The results are ordered with the 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * best quality match first. 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Because the detection only looks at a limited amount of the 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * input byte data, some of the returned charsets may fail to handle 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * all of the input data. 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <p> 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The returned UCharsetMatch objects are owned by the UCharsetDetector. 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * They will remain valid until the detector is closed or modified. 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <p> 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Return an error if 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <ul> 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <li>no charsets appear to match the input data.</li> 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * <li>no input text has been provided</li> 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * </ul> 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsd the charset detector to be used. 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param matchesFound pointer to a variable that will be set to the 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * number of charsets identified that are consistent with 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the input data. Output only. 
199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status any error conditions are reported back in this variable. 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return A pointer to an array of pointers to UCharsetMatch objects. 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * This array, and the UCharsetMatch instances to which it refers, 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * are owned by the UCharsetDetector, and will remain valid until 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the detector is closed or modified. 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE const UCharsetMatch ** U_EXPORT2 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Get the name of the charset represented by a UCharsetMatch. 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The storage for the returned name string is owned by the 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * UCharsetMatch, and will remain valid while the UCharsetMatch 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * is valid. 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The name returned is suitable for use with the ICU conversion APIs. 
219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsm The charset match object. 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status Any error conditions are reported back in this variable. 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return The name of the matching charset. 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE const char * U_EXPORT2 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Get a confidence number for the quality of the match of the byte 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * data with the charset. Confidence numbers range from zero to 100, 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * with 100 representing complete confidence and zero representing 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * no confidence. 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The confidence values are somewhat arbitrary. They define an 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * ordering within the results for any single detection operation 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * but are not generally comparable between the results for different input. 
238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * A confidence value of ten does have a general meaning - it is used 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * for charsets that can represent the input data, but for which there 241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * is no other indication that suggests that the charset is the correct one. 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Pure 7 bit ASCII data, for example, is compatible with a 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * great many charsets, most of which will appear as possible matches 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * with a confidence of 10. 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsm The charset match object. 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status Any error conditions are reported back in this variable. 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return A confidence number for the charset match. 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE int32_t U_EXPORT2 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Get the RFC 3066 code for the language of the input data. 
257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The Charset Detection service is intended primarily for detecting 259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * charsets, not language. For some, but not all, charsets, a language is 260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * identified as a byproduct of the detection process, and that is what 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * is returned by this function. 262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * CAUTION: 264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 1. Language information is not available for input data encoded in 265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * all charsets. In particular, no language is identified 266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * for UTF-8 input data. 267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 2. Closely related languages may sometimes be confused. 269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * If more accurate language detection is required, a linguistic 271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * analysis package should be used. 272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The storage for the returned name string is owned by the 274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * UCharsetMatch, and will remain valid while the UCharsetMatch 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * is valid. 
276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsm The charset match object. 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status Any error conditions are reported back in this variable. 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return The RFC 3066 code for the language of the input data, or 280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * an empty string if the language could not be determined. 281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE const char * U_EXPORT2 285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); 286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Get the entire input text as a UChar string, placing it into 290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * a caller-supplied buffer. A terminating 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * NUL character will be appended to the buffer if space is available. 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The number of UChars in the output string, not including the terminating 294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * NUL, is returned. 
295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * If the supplied buffer is smaller than required to hold the output, 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the contents of the buffer are undefined. The full output string length 298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * (in UChars) is returned as always, and can be used to allocate a buffer 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * of the correct size. 300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsm The charset match object. 303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param buf A UChar buffer to be filled with the converted text data. 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param cap The capacity of the buffer in UChars. 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status Any error conditions are reported back in this variable. 306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return The number of UChars in the output string. 
307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE int32_t U_EXPORT2 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_getUChars(const UCharsetMatch *ucsm, 312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *buf, int32_t cap, UErrorCode *status); 313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Get an iterator over the set of all detectable charsets - 318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * over the charsets that are known to the charset detection 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * service. 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The returned UEnumeration provides access to the names of 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the charsets. 
323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 32459d709d503bab6e2b61931737e662dd293b40578ccornelius * <p> 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The state of the Charset detector that is passed in does not 326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * affect the result of this function, but requiring a valid, open 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * charset detector as a parameter ensures that the charset detection 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * service has been safely initialized and that the required detection 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * data is available. 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 33159d709d503bab6e2b61931737e662dd293b40578ccornelius * <p> 33259d709d503bab6e2b61931737e662dd293b40578ccornelius * <b>Note:</b> Multiple different charset encodings in the same family may use 33359d709d503bab6e2b61931737e662dd293b40578ccornelius * a single shared name in this implementation. For example, this method returns 33459d709d503bab6e2b61931737e662dd293b40578ccornelius * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 33559d709d503bab6e2b61931737e662dd293b40578ccornelius * (Windows Latin 1). However, the actual detection result could be "windows-1252" 33659d709d503bab6e2b61931737e662dd293b40578ccornelius * when the input data matches Latin 1 code points with any points only available 33759d709d503bab6e2b61931737e662dd293b40578ccornelius * in "windows-1252". 33859d709d503bab6e2b61931737e662dd293b40578ccornelius * 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsd a Charset detector. 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param status Any error conditions are reported back in this variable. 
341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return an iterator providing access to the detectable charset names. 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE UEnumeration * U_EXPORT2 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Test whether input filtering is enabled for this charset detector. 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Input filtering removes text that appears to be HTML or xml 350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * markup from the input before applying the code page detection 351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * heuristics. 352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsd The charset detector to check. 354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return TRUE if filtering is enabled. 
355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 35759d709d503bab6e2b61931737e662dd293b40578ccornelius 358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE UBool U_EXPORT2 359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); 360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Enable filtering of input text. If filtering is enabled, 364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * text within angle brackets ("<" and ">") will be removed 365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * before detection, which will remove most HTML or xml markup. 366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param ucsd the charset detector to be modified. 368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param filter <code>true</code> to enable input text filtering. 369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return The previous setting. 
370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @stable ICU 3.6 372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_STABLE UBool U_EXPORT2 374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); 375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 37659d709d503bab6e2b61931737e662dd293b40578ccornelius#ifndef U_HIDE_INTERNAL_API 37759d709d503bab6e2b61931737e662dd293b40578ccornelius/** 37859d709d503bab6e2b61931737e662dd293b40578ccornelius * Get an iterator over the set of detectable charsets - 37959d709d503bab6e2b61931737e662dd293b40578ccornelius * over the charsets that are enabled by the specified charset detector. 38059d709d503bab6e2b61931737e662dd293b40578ccornelius * 38159d709d503bab6e2b61931737e662dd293b40578ccornelius * The returned UEnumeration provides access to the names of 38259d709d503bab6e2b61931737e662dd293b40578ccornelius * the charsets. 38359d709d503bab6e2b61931737e662dd293b40578ccornelius * 38459d709d503bab6e2b61931737e662dd293b40578ccornelius * @param ucsd a Charset detector. 38559d709d503bab6e2b61931737e662dd293b40578ccornelius * @param status Any error conditions are reported back in this variable. 38659d709d503bab6e2b61931737e662dd293b40578ccornelius * @return an iterator providing access to the detectable charset names by 38759d709d503bab6e2b61931737e662dd293b40578ccornelius * the specified charset detector. 
38859d709d503bab6e2b61931737e662dd293b40578ccornelius * @internal 38959d709d503bab6e2b61931737e662dd293b40578ccornelius */ 39059d709d503bab6e2b61931737e662dd293b40578ccorneliusU_INTERNAL UEnumeration * U_EXPORT2 39159d709d503bab6e2b61931737e662dd293b40578ccorneliusucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 39259d709d503bab6e2b61931737e662dd293b40578ccornelius 39359d709d503bab6e2b61931737e662dd293b40578ccornelius/** 39459d709d503bab6e2b61931737e662dd293b40578ccornelius * Enable or disable individual charset encoding. 39559d709d503bab6e2b61931737e662dd293b40578ccornelius * A name of charset encoding must be included in the names returned by 39659d709d503bab6e2b61931737e662dd293b40578ccornelius * {@link #getAllDetectableCharsets()}. 39759d709d503bab6e2b61931737e662dd293b40578ccornelius * 39859d709d503bab6e2b61931737e662dd293b40578ccornelius * @param ucsd a Charset detector. 39959d709d503bab6e2b61931737e662dd293b40578ccornelius * @param encoding encoding the name of charset encoding. 40059d709d503bab6e2b61931737e662dd293b40578ccornelius * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the 40159d709d503bab6e2b61931737e662dd293b40578ccornelius * charset encoding. 40259d709d503bab6e2b61931737e662dd293b40578ccornelius * @param status receives the return status. When the name of charset encoding 40359d709d503bab6e2b61931737e662dd293b40578ccornelius * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. 
40459d709d503bab6e2b61931737e662dd293b40578ccornelius * @internal 40559d709d503bab6e2b61931737e662dd293b40578ccornelius */ 40659d709d503bab6e2b61931737e662dd293b40578ccorneliusU_INTERNAL void U_EXPORT2 40759d709d503bab6e2b61931737e662dd293b40578ccorneliusucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); 40859d709d503bab6e2b61931737e662dd293b40578ccornelius#endif /* U_HIDE_INTERNAL_API */ 40959d709d503bab6e2b61931737e662dd293b40578ccornelius 410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* __UCSDET_H */ 412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 414