1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2013, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  ucsdet.h
7 *   encoding:   US-ASCII
8 *   indentation:4
9 *
10 *   created on: 2005Aug04
11 *   created by: Andy Heninger
12 *
13 *   ICU Character Set Detection, API for C
14 *
15 *   Draft version 18 Oct 2005
16 *
17 */
18
19#ifndef __UCSDET_H
20#define __UCSDET_H
21
22#include "unicode/utypes.h"
23
24#if !UCONFIG_NO_CONVERSION
25
26#include "unicode/localpointer.h"
27#include "unicode/uenum.h"
28
29/**
30 * \file
31 * \brief C API: Charset Detection API
32 *
33 * This API provides a facility for detecting the
34 * charset or encoding of character data in an unknown text format.
35 * The input data can be from an array of bytes.
36 * <p>
37 * Character set detection is at best an imprecise operation.  The detection
38 * process will attempt to identify the charset that best matches the characteristics
39 * of the byte data, but the process is partly statistical in nature, and
40 * the results cannot be guaranteed to always be correct.
41 * <p>
42 * For best accuracy in charset detection, the input data should be primarily
43 * in a single language, and at least a few hundred bytes' worth of plain text
44 * in that language is needed.  The detection process will attempt to
45 * ignore HTML- or XML-style markup that could otherwise obscure the content.
46 */
47
48
49struct UCharsetDetector;
50/**
51  * Opaque structure representing a charset detector; see ucsdet_open() and ucsdet_close().
52  * @stable ICU 3.6
53  */
54typedef struct UCharsetDetector UCharsetDetector;
55
56struct UCharsetMatch;
57/**
58  *  Opaque structure representing a match that was identified from a charset
59  *  detection operation.  Matches are owned by the UCharsetDetector that produced them.
60  *  @stable ICU 3.6
61  */
62typedef struct UCharsetMatch UCharsetMatch;
63
64/**
65  *  Open a charset detector.  The detector should be closed with
66  *  ucsdet_close() when it is no longer needed.
67  *  @param status Any error conditions occurring during the open
68  *                operation are reported back in this variable.
69  *  @return the newly opened charset detector.
70  *  @stable ICU 3.6
71  */
72U_STABLE UCharsetDetector * U_EXPORT2
73ucsdet_open(UErrorCode   *status);
74
75/**
76  *  Close a charset detector.  All storage and any other resources
77  *  owned by this charset detector will be released.  Failure to
78  *  close a charset detector when finished with it can result in
79  *  memory leaks in the application.  UCharsetMatch objects obtained
80  *  from the detector do not remain valid after it has been closed.
81  *  @param ucsd  The charset detector to be closed.
82  *  @stable ICU 3.6
83  */
84U_STABLE void U_EXPORT2
85ucsdet_close(UCharsetDetector *ucsd);
86
87#if U_SHOW_CPLUSPLUS_API
88  /* The declaration below is compiled only when U_SHOW_CPLUSPLUS_API is enabled. */
89U_NAMESPACE_BEGIN
90
91/**
92 * \class LocalUCharsetDetectorPointer
93 * "Smart pointer" class that closes a UCharsetDetector via ucsdet_close().
94 * For most methods see the LocalPointerBase base class.
95 *
96 * @see LocalPointerBase
97 * @see LocalPointer
98 * @stable ICU 4.4
99 */
100U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
101
102U_NAMESPACE_END
103
104#endif  /* U_SHOW_CPLUSPLUS_API */
105
106/**
107  * Set the input byte data whose charset is to be detected.
108  *
109  * Ownership of the input text byte array remains with the caller.
110  * The input string must not be altered or deleted until the charset
111  * detector is either closed or reset to refer to different input text.
112  *
113  * @param ucsd   the charset detector to be used.
114  * @param textIn the input text of unknown encoding.
115  * @param len    the length of the input text, or -1 if the text
116  *               is NUL terminated.
117  * @param status any error conditions are reported back in this variable.
118  *
119  * @stable ICU 3.6
120  */
121U_STABLE void U_EXPORT2
122ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
123
124
125/** Set the declared encoding for charset detection.
126 *  The declared encoding of an input text is an encoding obtained
127 *  by the user from an HTTP header, an XML declaration, or a similar source that
128 *  can be provided as an additional hint to the charset detector.
129 *
130 *  How and whether the declared encoding will be used during the
131 *  detection process is not yet specified (TBD).
132 *
133 * @param ucsd      the charset detector to be used.
134 * @param encoding  an encoding for the current data obtained from
135 *                  a header or declaration or other source outside
136 *                  of the byte data itself.
137 * @param length    the length of the encoding name, or -1 if the name string
138 *                  is NUL terminated.
139 * @param status    any error conditions are reported back in this variable.
140 *
141 * @stable ICU 3.6
142 */
143U_STABLE void U_EXPORT2
144ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
145
146
147/**
148 * Return the charset that best matches the supplied input data.
149 *
150 * Note, though, that because the detection
151 * only looks at the start of the input data,
152 * there is a possibility that the returned charset will fail to handle
153 * the full set of input data.
154 * <p>
155 * The returned UCharsetMatch object is owned by the UCharsetDetector.
156 * It will remain valid until the detector input is reset, or until
157 * the detector is closed.
158 * <p>
159 * The function will fail if
160 *  <ul>
161 *    <li>no charset appears to match the data.</li>
162 *    <li>no input text has been provided.</li>
163 *  </ul>
164 *
165 * @param ucsd      the charset detector to be used.
166 * @param status    any error conditions are reported back in this variable.
167 * @return          a UCharsetMatch representing the best matching charset,
168 *                  or NULL if no charset matches the byte data.
169 *
170 * @stable ICU 3.6
171 */
172U_STABLE const UCharsetMatch * U_EXPORT2
173ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
174
175
176/**
177 *  Find all charset matches that appear to be consistent with the input,
178 *  returning an array of results.  The results are ordered with the
179 *  best quality match first.
180 *
181 *  Because the detection only looks at a limited amount of the
182 *  input byte data, some of the returned charsets may fail to handle
183 *  all of the input data.
184 *  <p>
185 *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
186 *  They will remain valid until the detector is closed or modified.
187 *
188 * <p>
189 * An error is returned if
190 *  <ul>
191 *    <li>no charsets appear to match the input data.</li>
192 *    <li>no input text has been provided.</li>
193 *  </ul>
194 *
195 * @param ucsd          the charset detector to be used.
196 * @param matchesFound  pointer to a variable that will be set to the
197 *                      number of charsets identified that are consistent with
198 *                      the input data.  Output only.
199 * @param status        any error conditions are reported back in this variable.
200 * @return              A pointer to an array of pointers to UCharsetMatch objects.
201 *                      This array, and the UCharsetMatch instances to which it refers,
202 *                      are owned by the UCharsetDetector, and will remain valid until
203 *                      the detector is closed or modified.
204 * @stable ICU 3.6
205 */
206U_STABLE const UCharsetMatch ** U_EXPORT2
207ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
208
209
210
211/**
212 *  Get the name of the charset represented by a UCharsetMatch.
213 *
214 *  The storage for the returned name string is owned by the
215 *  UCharsetMatch, and will remain valid only while the UCharsetMatch
216 *  itself is valid; the caller must not free it.
217 *
218 *  The name returned is suitable for use with the ICU conversion APIs.
219 *
220 *  @param ucsm    The charset match object.
221 *  @param status  Any error conditions are reported back in this variable.
222 *  @return        The name of the matching charset.
223 *
224 *  @stable ICU 3.6
225 */
226U_STABLE const char * U_EXPORT2
227ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
228
229/**
230 *  Get a confidence number for the quality of the match of the byte
231 *  data with the charset.  Confidence numbers range from zero to 100,
232 *  with 100 representing complete confidence and zero representing
233 *  no confidence.
234 *
235 *  The confidence values are somewhat arbitrary.  They define an
236 *  ordering within the results for any single detection operation
237 *  but are not generally comparable between the results for different input.
238 *
239 *  A confidence value of ten does have a general meaning - it is used
240 *  for charsets that can represent the input data, but for which there
241 *  is no other indication that suggests that the charset is the correct one.
242 *  Pure 7 bit ASCII data, for example, is compatible with a
243 *  great many charsets, most of which will appear as possible matches
244 *  with a confidence of 10.
245 *
246 *  @param ucsm    The charset match object.
247 *  @param status  Any error conditions are reported back in this variable.
248 *  @return        A confidence number for the charset match.
249 *
250 *  @stable ICU 3.6
251 */
252U_STABLE int32_t U_EXPORT2
253ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
254
255/**
256 *  Get the RFC 3066 code for the language of the input data.
257 *
258 *  The Charset Detection service is intended primarily for detecting
259 *  charsets, not language.  For some, but not all, charsets, a language is
260 *  identified as a byproduct of the detection process, and that is what
261 *  is returned by this function.
262 *
263 *  CAUTION:
264 *    1.  Language information is not available for input data encoded in
265 *        all charsets. In particular, no language is identified
266 *        for UTF-8 input data.
267 *
268 *    2.  Closely related languages may sometimes be confused.
269 *
270 *  If more accurate language detection is required, a linguistic
271 *  analysis package should be used.
272 *
273 *  The storage for the returned language code string is owned by the
274 *  UCharsetMatch, and will remain valid while the UCharsetMatch
275 *  is valid.
276 *
277 *  @param ucsm    The charset match object.
278 *  @param status  Any error conditions are reported back in this variable.
279 *  @return        The RFC 3066 code for the language of the input data, or
280 *                 an empty string if the language could not be determined.
281 *
282 *  @stable ICU 3.6
283 */
284U_STABLE const char * U_EXPORT2
285ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
286
287
288/**
289  *  Get the entire input text as a UChar string, placing it into
290  *  a caller-supplied buffer.  A terminating
291  *  NUL character will be appended to the buffer if space is available.
292  *
293  *  The number of UChars in the output string, not including the terminating
294  *  NUL, is returned.
295  *
296  *  If the supplied buffer is smaller than required to hold the output,
297  *  the contents of the buffer are undefined.  The full output string length
298  *  (in UChars) is returned as always, and can be used to allocate a buffer
299  *  of the correct size.
300  *
301  *
302  * @param ucsm    The charset match object.
303  * @param buf     A UChar buffer to be filled with the converted text data.
304  * @param cap     The capacity of the buffer in UChars.
305  * @param status  Any error conditions are reported back in this variable.
306  * @return        The number of UChars in the output string, not including the terminating NUL.
307  *
308  * @stable ICU 3.6
309  */
310U_STABLE  int32_t U_EXPORT2
311ucsdet_getUChars(const UCharsetMatch *ucsm,
312                 UChar *buf, int32_t cap, UErrorCode *status);
313
314
315
316/**
317  *  Get an iterator over the set of all detectable charsets -
318  *  over the charsets that are known to the charset detection
319  *  service.
320  *
321  *  The returned UEnumeration provides access to the names of
322  *  the charsets.
323  *
324  *  <p>
325  *  The state of the Charset detector that is passed in does not
326  *  affect the result of this function, but requiring a valid, open
327  *  charset detector as a parameter ensures that the charset detection
328  *  service has been safely initialized and that the required detection
329  *  data is available.
330  *
331  *  <p>
332  *  <b>Note:</b> Multiple different charset encodings in the same family may use
333  *  a single shared name in this implementation. For example, this function returns
334  *  an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
335  *  (Windows Latin 1). However, the actual detection result could be "windows-1252"
336  *  when the input data matches Latin 1 code points with any points only available
337  *  in "windows-1252".
338  *
339  *  @param ucsd a Charset detector.
340  *  @param status  Any error conditions are reported back in this variable.
341  *  @return an iterator providing access to the detectable charset names.
342  *  @stable ICU 3.6
343  */
344U_STABLE  UEnumeration * U_EXPORT2
345ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
346
347/**
348  *  Test whether input filtering is enabled for this charset detector.
349  *  Input filtering removes text that appears to be HTML or XML
350  *  markup from the input before applying the code page detection
351  *  heuristics.
352  *
353  *  @param ucsd  The charset detector to check.
354  *  @return TRUE if filtering is enabled.
355  *  @stable ICU 3.6
356  */
357
358U_STABLE  UBool U_EXPORT2
359ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
360
361
362/**
363 * Enable filtering of input text. If filtering is enabled,
364 * text within angle brackets ("<" and ">") will be removed
365 * before detection, which will remove most HTML or XML markup.
366 *
367 * @param ucsd   the charset detector to be modified.
368 * @param filter <code>TRUE</code> to enable input text filtering.
369 * @return The previous setting.
370 *
371 * @stable ICU 3.6
372 */
373U_STABLE  UBool U_EXPORT2
374ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
375
376#ifndef U_HIDE_INTERNAL_API
377/**
378  *  Get an iterator over the set of detectable charsets -
379  *  over the charsets that are enabled by the specified charset detector.
380  *
381  *  The returned UEnumeration provides access to the names of
382  *  the charsets.
383  *
384  *  @param ucsd a Charset detector.
385  *  @param status  Any error conditions are reported back in this variable.
386  *  @return an iterator providing access to the names of the charsets
387  *  that are enabled by the specified charset detector.
388  *  @internal
389  */
390U_INTERNAL UEnumeration * U_EXPORT2
391ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
392
393/**
394  * Enable or disable an individual charset encoding.
395  * The name of the charset encoding must be included in the names returned by
396  * ucsdet_getAllDetectableCharsets().
397  *
398  * @param ucsd a Charset detector.
399  * @param encoding the name of the charset encoding.
400  * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
401  *   charset encoding.
402  * @param status receives the return status. When the name of charset encoding
403  *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
404  * @internal
405  */
406U_INTERNAL void U_EXPORT2
407ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
408#endif  /* U_HIDE_INTERNAL_API */
409
410#endif
411#endif   /* __UCSDET_H */
412
413
414