1/*
2**********************************************************************
3*   Copyright (C) 1999-2009, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6 *
7 *
8 *   ucnv_err.h:
9 */
10
11/**
12 * \file
13 * \brief C UConverter predefined error callbacks
14 *
15 *  <h2>Error Behaviour Functions</h2>
16 *  Defines some error behaviour functions called by ucnv_{from,to}Unicode
17 *  These are provided as part of ICU and many are stable, but they
18 *  can also be considered only as an example of what can be done with
19 *  callbacks.  You may of course write your own.
20 *
21 *  If you write your own callbacks, you may find the functions from
22 *  ucnv_cb.h useful.
23 *
24 *  These functions, although public, should NEVER be called directly.
25 *  They should be used as parameters to the ucnv_setFromUCallBack
26 *  and ucnv_setToUCallBack functions, to set the behaviour of a converter
27 *  when it encounters ILLEGAL/UNMAPPED/INVALID sequences.
28 *
29 *  usage example:  'STOP' doesn't need any context, but newContext
30 *    could be set to something other than 'NULL' if needed. The available
31 *    contexts in this header can modify the default behavior of the callback.
32 *
33 *  \code
34 *  UErrorCode err = U_ZERO_ERROR;
35 *  UConverter *myConverter = ucnv_open("ibm-949", &err);
36 *  const void *oldContext;
37 *  UConverterFromUCallback oldAction;
38 *
39 *
40 *  if (U_SUCCESS(err))
41 *  {
42 *      ucnv_setFromUCallBack(myConverter,
43 *                       UCNV_FROM_U_CALLBACK_STOP,
44 *                       NULL,
45 *                       &oldAction,
46 *                       &oldContext,
47 *                       &err);
48 *  }
49 *  \endcode
50 *
51 *  The code above tells "myConverter" to stop when it encounters an
52 *  ILLEGAL/TRUNCATED/INVALID sequence while it is used to convert from
53 *  Unicode -> Codepage. The behavior from Codepage to Unicode is not changed,
54 *  and ucnv_setToUCallBack would need to be called in order to change
55 *  that behavior too.
56 *
57 *  Here is an example with a context:
58 *
59 *  \code
60 *  UErrorCode err = U_ZERO_ERROR;
61 *  UConverter *myConverter = ucnv_open("ibm-949", &err);
62 *  const void *oldContext;
63 *  UConverterToUCallback oldAction;
64 *
65 *
66 *  if (U_SUCCESS(err))
67 *  {
68 *      ucnv_setToUCallBack(myConverter,
69 *                       UCNV_TO_U_CALLBACK_SUBSTITUTE,
70 *                       UCNV_SUB_STOP_ON_ILLEGAL,
71 *                       &oldAction,
72 *                       &oldContext,
73 *                       &err);
74 *  }
75 *  \endcode
76 *
77 *  The code above tells "myConverter" to stop when it encounters an
78 *  ILLEGAL/TRUNCATED/INVALID sequence while it is used to convert from
79 *  Codepage -> Unicode. Any unmapped but legal characters will be
80 *  replaced with the default substitution character.
81 */
82
83#ifndef UCNV_ERR_H
84#define UCNV_ERR_H
85
86#include "unicode/utypes.h"
87
88#if !UCONFIG_NO_CONVERSION
89
90/** Forward declaration of the UConverter structure; this header only uses pointers to it. @stable ICU 2.0 */
91struct UConverter;
92
93/** Typedef that lets the converter structure be referred to simply as UConverter. @stable ICU 2.0 */
94typedef struct UConverter UConverter;
95
96/**
97 * FROM_U, TO_U context option for the substitute callbacks: stop at an ILLEGAL_SEQUENCE instead of substituting it
98 * @stable ICU 2.0
99 */
100#define UCNV_SUB_STOP_ON_ILLEGAL "i"
101
102/**
103 * FROM_U, TO_U context option for the skip callbacks: stop at an ILLEGAL_SEQUENCE instead of skipping it
104 * @stable ICU 2.0
105 */
106#define UCNV_SKIP_STOP_ON_ILLEGAL "i"
107
108/**
109 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
110 * @stable ICU 2.0
111 */
112#define UCNV_ESCAPE_ICU       NULL
113/**
114 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
115 * @stable ICU 2.0
116 */
117#define UCNV_ESCAPE_JAVA      "J"
118/**
119 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
120 * TO_U_CALLBACK_ESCAPE context option to escape the character value according to C (\\xXXXX)
121 * @stable ICU 2.0
122 */
123#define UCNV_ESCAPE_C         "C"
124/**
125 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
126 * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
127 * @stable ICU 2.0
128 */
129#define UCNV_ESCAPE_XML_DEC   "D"
130/**
131 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
132 * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
133 * @stable ICU 2.0
134 */
135#define UCNV_ESCAPE_XML_HEX   "X"
136/**
137 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
138 * @stable ICU 2.0
139 */
140#define UCNV_ESCAPE_UNICODE   "U"
141
142/**
143 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to CSS2 conventions (\\HH..H<space>, that is,
144 * a backslash, 1..6 hex digits, and a space)
145 * @stable ICU 4.0
146 */
147#define UCNV_ESCAPE_CSS2   "S"
148
149/**
150 * The process condition code to be used with the callbacks.
151 * Codes which are greater than UCNV_IRREGULAR (UCNV_RESET, UCNV_CLOSE,
152 * UCNV_CLONE) should be passed on to any chained callbacks.
153 * @stable ICU 2.0
154 */
155typedef enum {
156    UCNV_UNASSIGNED = 0,  /**< The code point is unassigned.
157                             The error code U_INVALID_CHAR_FOUND will be set. */
158    UCNV_ILLEGAL = 1,     /**< The code point is illegal. For example,
159                             \\x81\\x2E is illegal in SJIS because \\x2E
160                             is not a valid trail byte for the \\x81
161                             lead byte.
162                             Also, starting with Unicode 3.0.1, non-shortest byte sequences
163                             in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061)
164                             are also illegal, not just irregular.
165                             The error code U_ILLEGAL_CHAR_FOUND will be set. */
166    UCNV_IRREGULAR = 2,   /**< The code point is not a regular sequence in
167                             the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF
168                             are irregular UTF-8 byte sequences for single surrogate
169                             code points.
170                             The error code U_INVALID_CHAR_FOUND will be set. */
171    UCNV_RESET = 3,       /**< The callback is called with this reason when a
172                             'reset' has occurred. The callback should reset all
173                             state. */
174    UCNV_CLOSE = 4,        /**< Called when the converter is closed. The
175                             callback should release any allocated memory.*/
176    UCNV_CLONE = 5         /**< Called when ucnv_safeClone() is called on the
177                              converter. The pointer available as the
178                              'context' is an alias to the original converter's
179                              context pointer. If the context must be owned
180                              by the new converter, the callback must clone
181                              the data and call ucnv_setFromUCallBack
182                              (or ucnv_setToUCallBack) with the correct pointer.
183                              @stable ICU 2.2
184                           */
185} UConverterCallbackReason;
186
187
188/**
189 * The structure for the fromUnicode callback function parameter.
190 * @stable ICU 2.0
191 */
192typedef struct {
193    uint16_t size;              /**< The size of this struct. @stable ICU 2.0 */
194    UBool flush;                /**< The internal state of the converter will be reset and data flushed if set to TRUE. @stable ICU 2.0    */
195    UConverter *converter;      /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0  */
196    const UChar *source;        /**< Pointer to the source buffer (UChars). @stable ICU 2.0    */
197    const UChar *sourceLimit;   /**< Pointer to the limit (end + 1) of the source buffer. @stable ICU 2.0    */
198    char *target;               /**< Pointer to the target buffer (bytes). @stable ICU 2.0    */
199    const char *targetLimit;    /**< Pointer to the limit (end + 1) of the target buffer. @stable ICU 2.0     */
200    int32_t *offsets;           /**< Pointer to the buffer that receives the offsets; written as *offsets = value; offsets++;. @stable ICU 2.0  */
201} UConverterFromUnicodeArgs;
202
203
204/**
205 * The structure for the toUnicode callback function parameter.
206 * @stable ICU 2.0
207 */
208typedef struct {
209    uint16_t size;              /**< The size of this struct. @stable ICU 2.0 */
210    UBool flush;                /**< The internal state of the converter will be reset and data flushed if set to TRUE. @stable ICU 2.0   */
211    UConverter *converter;      /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */
212    const char *source;         /**< Pointer to the source buffer (bytes). @stable ICU 2.0    */
213    const char *sourceLimit;    /**< Pointer to the limit (end + 1) of the source buffer. @stable ICU 2.0    */
214    UChar *target;              /**< Pointer to the target buffer (UChars). @stable ICU 2.0    */
215    const UChar *targetLimit;   /**< Pointer to the limit (end + 1) of the target buffer. @stable ICU 2.0     */
216    int32_t *offsets;           /**< Pointer to the buffer that receives the offsets; written as *offsets = value; offsets++;. @stable ICU 2.0  */
217} UConverterToUnicodeArgs;
218
219
220/**
221 * DO NOT CALL THIS FUNCTION DIRECTLY! Pass it to ucnv_setFromUCallBack instead.
222 * This From Unicode callback STOPS at the ILLEGAL_SEQUENCE,
223 * returning the error code back to the caller immediately.
224 *
225 * @param context Pointer to the callback's private data
226 * @param fromUArgs Information about the conversion in progress
227 * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
228 * @param length Number of UChars in the concerned Unicode sequence
229 * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode code point.
230 * @param reason Defines the reason the callback was invoked
231 * @param err This should always be set to a failure status prior to calling.
232 * @stable ICU 2.0
233 */
234U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP (
235                  const void *context,
236                  UConverterFromUnicodeArgs *fromUArgs,
237                  const UChar* codeUnits,
238                  int32_t length,
239                  UChar32 codePoint,
240                  UConverterCallbackReason reason,
241                  UErrorCode * err);
242
243
244
245/**
246 * DO NOT CALL THIS FUNCTION DIRECTLY! Pass it to ucnv_setToUCallBack instead.
247 * This To Unicode callback STOPS at the ILLEGAL_SEQUENCE,
248 * returning the error code back to the caller immediately.
249 *
250 * @param context Pointer to the callback's private data
251 * @param toUArgs Information about the conversion in progress
252 * @param codeUnits Points to 'length' bytes of the concerned codepage sequence
253 * @param length Size (in bytes) of the concerned codepage sequence
254 * @param reason Defines the reason the callback was invoked
255 * @param err This should always be set to a failure status prior to calling.
256 * @stable ICU 2.0
257 */
258U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
259                  const void *context,
260                  UConverterToUnicodeArgs *toUArgs,
261                  const char* codeUnits,
262                  int32_t length,
263                  UConverterCallbackReason reason,
264                  UErrorCode * err);
265
266/**
267 * DO NOT CALL THIS FUNCTION DIRECTLY! Pass it to ucnv_setFromUCallBack instead.
268 * This From Unicode callback skips any ILLEGAL_SEQUENCE, or
269 * skips only UNASSIGNED_SEQUENCE depending on the context parameter,
270 * simply ignoring those characters.
271 *
272 * @param context  The function currently recognizes the callback options:
273 *                 UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
274 *                      returning the error code back to the caller immediately.
275 *                 NULL: Skips any ILLEGAL_SEQUENCE
276 * @param fromUArgs Information about the conversion in progress
277 * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
278 * @param length Number of UChars in the concerned Unicode sequence
279 * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode code point.
280 * @param reason Defines the reason the callback was invoked
281 * @param err Return value will be set to success if the callback was handled,
282 *      otherwise this value will be set to a failure status.
283 * @stable ICU 2.0
284 */
285U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
286                  const void *context,
287                  UConverterFromUnicodeArgs *fromUArgs,
288                  const UChar* codeUnits,
289                  int32_t length,
290                  UChar32 codePoint,
291                  UConverterCallbackReason reason,
292                  UErrorCode * err);
293
294/**
295 * DO NOT CALL THIS FUNCTION DIRECTLY! Pass it to ucnv_setFromUCallBack instead.
296 * This From Unicode callback will substitute the ILLEGAL_SEQUENCE, or only the
297 * UNASSIGNED_SEQUENCE depending on the context parameter, with the
298 * current substitution string for the converter. This is the default
299 * callback.
300 *
301 * @param context The function currently recognizes the callback options:
302 *                 UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
303 *                      returning the error code back to the caller immediately.
304 *                 NULL: Substitutes any ILLEGAL_SEQUENCE
305 * @param fromUArgs Information about the conversion in progress
306 * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
307 * @param length Number of UChars in the concerned Unicode sequence
308 * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode code point.
309 * @param reason Defines the reason the callback was invoked
310 * @param err Return value will be set to success if the callback was handled,
311 *      otherwise this value will be set to a failure status.
312 * @see ucnv_setSubstChars
313 * @stable ICU 2.0
314 */
315U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
316                  const void *context,
317                  UConverterFromUnicodeArgs *fromUArgs,
318                  const UChar* codeUnits,
319                  int32_t length,
320                  UChar32 codePoint,
321                  UConverterCallbackReason reason,
322                  UErrorCode * err);
323
324/**
325 * DO NOT CALL THIS FUNCTION DIRECTLY! Pass it to ucnv_setFromUCallBack instead.
326 * This From Unicode callback will substitute the ILLEGAL SEQUENCE with an
327 * escaped (hexadecimal or decimal) representation of the illegal code points.
328 *
329 * @param context The function currently recognizes the callback options:
330 *        <ul>
331 *        <li>UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
332 *          representation in the format %UXXXX (e.g. "%UFFFE%U00AC%UC8FE").
333 *          In the event the converter doesn't support the characters {%,U}[A-F][0-9],
334 *          it will substitute the illegal sequence with the substitution characters.
335 *          Note that a supplementary code point (one encoded as a surrogate pair) is represented as
336 *          %UD84D%UDC56</li>
337 *        <li>UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
338 *          representation in the format \\uXXXX (e.g. "\\uFFFE\\u00AC\\uC8FE").
339 *          In the event the converter doesn't support the characters {\,u}[A-F][0-9],
340 *          it will substitute the illegal sequence with the substitution characters.
341 *          Note that a supplementary code point (one encoded as a surrogate pair) is represented as
342 *          \\uD84D\\uDC56</li>
343 *        <li>UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
344 *          representation in the format \\uXXXX (e.g. "\\uFFFE\\u00AC\\uC8FE").
345 *          In the event the converter doesn't support the characters {\,u,U}[A-F][0-9],
346 *          it will substitute the illegal sequence with the substitution characters.
347 *          Note that a supplementary code point (one encoded as a surrogate pair) is represented as
348 *          \\U00023456</li>
349 *        <li>UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal
350 *          representation in the format \htmlonly&amp;#DDDDDDDD; (e.g. "&amp;#65534;&amp;#172;&amp;#51454;")\endhtmlonly.
351 *          In the event the converter doesn't support the characters {&amp;,#}[0-9],
352 *          it will substitute the illegal sequence with the substitution characters.
353 *          Note that a supplementary code point (one encoded as a surrogate pair) is represented as
354 *          &amp;#144470; and zero padding is ignored.</li>
355 *        <li>UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
356 *          representation in the format \htmlonly&amp;#xXXXX; (e.g. "&amp;#xFFFE;&amp;#x00AC;&amp;#xC8FE;")\endhtmlonly.
357 *          In the event the converter doesn't support the characters {&amp;,#,x}[A-F][0-9],
358 *          it will substitute the illegal sequence with the substitution characters.
359 *          Note that a supplementary code point (one encoded as a surrogate pair) is represented as
360 *          \htmlonly&amp;#x23456;\endhtmlonly</li>
361 *        </ul> UCNV_ESCAPE_UNICODE (U+XXXXX) and UCNV_ESCAPE_CSS2 are also documented in this header as options for this callback.
362 * @param fromUArgs Information about the conversion in progress
363 * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
364 * @param length Number of UChars in the concerned Unicode sequence
365 * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode code point.
366 * @param reason Defines the reason the callback was invoked
367 * @param err Return value will be set to success if the callback was handled,
368 *      otherwise this value will be set to a failure status.
369 * @stable ICU 2.0
370 */
371U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE (
372                  const void *context,
373                  UConverterFromUnicodeArgs *fromUArgs,
374                  const UChar* codeUnits,
375                  int32_t length,
376                  UChar32 codePoint,
377                  UConverterCallbackReason reason,
378                  UErrorCode * err);
379
380
381/**
382 * DO NOT CALL THIS FUNCTION DIRECTLY! Pass it to ucnv_setToUCallBack instead.
383 * This To Unicode callback skips any ILLEGAL_SEQUENCE, or
384 * skips only UNASSIGNED_SEQUENCE depending on the context parameter,
385 * simply ignoring those characters.
386 *
387 * @param context  The function currently recognizes the callback options:
388 *                 UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
389 *                      returning the error code back to the caller immediately.
390 *                 NULL: Skips any ILLEGAL_SEQUENCE
391 * @param toUArgs Information about the conversion in progress
392 * @param codeUnits Points to 'length' bytes of the concerned codepage sequence
393 * @param length Size (in bytes) of the concerned codepage sequence
394 * @param reason Defines the reason the callback was invoked
395 * @param err Return value will be set to success if the callback was handled,
396 *      otherwise this value will be set to a failure status.
397 * @stable ICU 2.0
398 */
399U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
400                  const void *context,
401                  UConverterToUnicodeArgs *toUArgs,
402                  const char* codeUnits,
403                  int32_t length,
404                  UConverterCallbackReason reason,
405                  UErrorCode * err);
406
407/**
408 * DO NOT CALL THIS FUNCTION DIRECTLY! Pass it to ucnv_setToUCallBack instead.
409 * This To Unicode callback will substitute the ILLEGAL_SEQUENCE, or only the
410 * UNASSIGNED_SEQUENCE depending on the context parameter, with the
411 * Unicode substitution character, U+FFFD.
412 *
413 * @param context  The function currently recognizes the callback options:
414 *                 UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
415 *                      returning the error code back to the caller immediately.
416 *                 NULL: Substitutes any ILLEGAL_SEQUENCE
417 * @param toUArgs Information about the conversion in progress
418 * @param codeUnits Points to 'length' bytes of the concerned codepage sequence
419 * @param length Size (in bytes) of the concerned codepage sequence
420 * @param reason Defines the reason the callback was invoked
421 * @param err Return value will be set to success if the callback was handled,
422 *      otherwise this value will be set to a failure status.
423 * @stable ICU 2.0
424 */
425U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE (
426                  const void *context,
427                  UConverterToUnicodeArgs *toUArgs,
428                  const char* codeUnits,
429                  int32_t length,
430                  UConverterCallbackReason reason,
431                  UErrorCode * err);
432
433/**
434 * DO NOT CALL THIS FUNCTION DIRECTLY! Pass it to ucnv_setToUCallBack instead.
435 * This To Unicode callback will substitute the ILLEGAL SEQUENCE with the
436 * hexadecimal representation of the illegal bytes
437 *  (in the format  %XNN, e.g. "%XFF%X0A%XC8%X03").
438 *
439 * @param context This function currently recognizes the callback options:
440 *      UCNV_ESCAPE_ICU, UCNV_ESCAPE_JAVA, UCNV_ESCAPE_C, UCNV_ESCAPE_XML_DEC,
441 *      UCNV_ESCAPE_XML_HEX and UCNV_ESCAPE_UNICODE.
442 * @param toUArgs Information about the conversion in progress
443 * @param codeUnits Points to 'length' bytes of the concerned codepage sequence
444 * @param length Size (in bytes) of the concerned codepage sequence
445 * @param reason Defines the reason the callback was invoked
446 * @param err Return value will be set to success if the callback was handled,
447 *      otherwise this value will be set to a failure status.
448 * @stable ICU 2.0
449 */
450
451U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_ESCAPE (
452                  const void *context,
453                  UConverterToUnicodeArgs *toUArgs,
454                  const char* codeUnits,
455                  int32_t length,
456                  UConverterCallbackReason reason,
457                  UErrorCode * err);
458
459#endif
460
461#endif
462
463/*UCNV_ERR_H*/
464