1/*
2**********************************************************************
3*   Copyright (C) 2000-2010, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv2022.c
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2000feb03
12*   created by: Markus W. Scherer
13*
14*   Change history:
15*
16*   06/29/2000  helena  Major rewrite of the callback APIs.
17*   08/08/2000  Ram     Included support for ISO-2022-JP-2
18*                       Changed implementation of toUnicode
19*                       function
20*   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
22*                       ucnvebdc.c
23*   09/20/2000  Ram     Added support for ISO-2022-CN
24*                       Added implementations for getNextUChar()
25*                       for specific 2022 country variants.
26*   10/31/2000  Ram     Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
37#include "ucnv_imp.h"
38#include "ucnv_bld.h"
39#include "ucnv_cnv.h"
40#include "ucnvmbcs.h"
41#include "cstring.h"
42#include "cmemory.h"
43
/*
 * Number of elements of a true array (not of a pointer), as an int32_t.
 * The whole replacement list is parenthesized so that the macro behaves like
 * a single primary expression wherever it is expanded (for example as the
 * operand of sizeof or in front of a postfix operator).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46#ifdef U_ENABLE_GENERIC_ISO_2022
47/*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 *    its designation sequences, single shifts with return to the previous state,
54 *    switch-with-no-return to UTF-16BE or similar, etc.
55 *    This is unlike the language-specific variants like ISO-2022-JP which
56 *    require a much smaller repertoire of ISO-2022 features.
57 *    These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 *    but rather always one of the language-specific variants.
60 *    Note that ICU's generic ISO-2022 converter has always output one escape
61 *    sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 *    the previous converter is closed and a new one opened,
64 *    without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 *    reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 *    This means, for example, that when ISO-8859-7 is designated, the following
69 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 *    The ICU ISO-2022 converter does not handle this - and has no information
71 *    about which subconverter would have to be shifted vs. which is designed
72 *    for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76#endif
77
/* One-byte, NUL-terminated strings holding the SO (0x0E) and SI (0x0F) control codes. */
static const char SHIFT_OUT_STR[] = { 0x0E, '\0' };
static const char SHIFT_IN_STR[]  = { 0x0F, '\0' };
80
/* Byte values of the whitespace/control characters used below, in ascending order. */
#define H_TAB   0x09    /* horizontal tab */
#define LF      0x0A    /* line feed */
#define V_TAB   0x0B    /* vertical tab */
#define CR      0x0D    /* carriage return */
#define SPACE   0x20
86
/* Inclusive code point range U+FF61..U+FF9F used for half-width Katakana. */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
91
/*
 * Native (high-half) byte ranges of the graphic character sets:
 * - a 94-character set occupies native bytes A1..FE and is written in
 *   ISO 2022 as bytes 21..7E;
 * - a 96-character set occupies native bytes A0..FF and is written in
 *   ISO 2022 as bytes 20..7F.
 * In both cases the ISO 2022 byte is the native byte minus 0x80.
 * C1 control codes (native bytes 80..9F) must not be encoded this way,
 * because they would turn into the C0 control codes 00..1F.
 */
enum {
    GR96_START = 0xa0,
    GR94_START = 0xa1,
    GR94_END   = 0xfe,
    GR96_END   = 0xff
};
106
/*
 * Nonzero exactly for SO (0x0e), SI (0x0f) and ESC (0x1b).
 * These ISO 2022 control codes must not be converted from Unicode because
 * they would corrupt the shift/designation state of the byte stream.
 * The constant 0x0800c000 has bits 0xe, 0xf and 0x1b set; the bit for c is
 * only examined when c<0x20, so the shift count never reaches 32.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)0x0800c000>>(c))&1)!=0)
114
/*
 * Charset/state numbers for the ISO-2022-JP and -CN implementations.
 * The JP and CN groups deliberately reuse the same small values.
 */
typedef enum {
    /* values shared by both implementations */
    INVALID_STATE = -1,
    ASCII         = 0,

    /* single-shift states */
    SS2_STATE = 0x10,
    SS3_STATE = 0x11,

    /* JP charsets */
    ISO8859_1   = 1,
    ISO8859_7   = 2,
    JISX201     = 3,
    JISX208     = 4,
    JISX212     = 5,
    GB2312      = 6,
    KSC5601     = 7,
    HWKANA_7BIT = 8,    /* Halfwidth Katakana 7 bit */

    /*
     * CN charsets.
     * These first constants must keep their values because they
     * correspond to (index into) myConverterArray[].
     */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * CNS 11643 plane values: these are used in StateEnum and ISO2022State
     * variables, but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
153
/* Nonzero if the StateEnum charset value cs lies in the JP range JISX208..KSC5601 (the DBCS charsets). */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)
156
/* Charset mask: a value with only bit number cs set. */
#define CSM(cs) ((uint16_t)0x0001<<(cs))
158
159/*
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
162 *
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 *   all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
167 */
168enum { MAX_JA_VERSION=4 };
169static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
170    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
171    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
172    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
173    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
174    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
175};
176
/* Kind of sub-converter that is currently selected (see UConverterDataISO2022.currentType). */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
185
/* Designation and shift state of one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
191
/* number of slots in UConverterDataISO2022.myConverterArray[] */
#define UCNV_2022_MAX_CONVERTERS 10
/* bits of the converter options that carry the variant version number */
#define UCNV_OPTIONS_VERSION_MASK 0xf
194
195typedef struct{
196    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
197    UConverter *currentConverter;
198    Cnv2022Type currentType;
199    ISO2022State toU2022State, fromU2022State;
200    uint32_t key;
201    uint32_t version;
202#ifdef U_ENABLE_GENERIC_ISO_2022
203    UBool isFirstBuffer;
204#endif
205    UBool isEmptySegment;
206    char name[30];
207    char locale[3];
208}UConverterDataISO2022;
209
210/* Protos */
211/* ISO-2022 ----------------------------------------------------------------- */
212
213/*Forward declaration */
214U_CFUNC void
215ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
216                      UErrorCode * err);
217U_CFUNC void
218ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
219                                    UErrorCode * err);
220
/* the ESC control code that starts every escape sequence */
#define ESC_2022 0x1B
222
/* Classification of the escape sequence bytes that have been matched so far. */
typedef enum {
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /*
     * so far matches one ISO 2022 escape sequence, but by adding more
     * characters it might match another escape sequence
     */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
230
231/*
232* The way these state transition arrays work is:
233* ex : ESC$B is the sequence for JISX208
234*      a) First Iteration: char is ESC
235*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
236*             int x = normalize_esq_chars_2022[27] which is equal to 1
237*         ii) Search for this value in escSeqStateTable_Key_2022[]
238*             value of x is stored at escSeqStateTable_Key_2022[0]
239*        iii) Save this index as offset
240*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
241*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
242*     b) Switch on this state and continue to next char
243*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
244*             which is normalize_esq_chars_2022[36] == 4
245*         ii) x is currently 1(from above)
246*               x<<=5 -- x is now 32
247*               x+=normalize_esq_chars_2022[36]
248*               now x is 36
249*        iii) Search for this value in escSeqStateTable_Key_2022[]
250*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
251*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
252*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
253*     c) Switch on this state and continue to next char
254*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
255*        ii) x is currently 36 (from above)
256*            x<<=5 -- x is now 1152
257*            x+=normalize_esq_chars_2022[66]
258*            now x is 1161
*       iii) Search for this value in escSeqStateTable_Key_2022[]
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
264*/
265
266
/* Below are the 3 arrays depicting a state transition table */

/*
 * Maps a byte to the small number (1..29) that represents it in an escape
 * sequence key; bytes that cannot occur in a recognized escape sequence map to 0.
 * Sixteen entries per row; the row comment gives the byte value of the first entry.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  /* 0x1B ESC */
/* 0x20 */  0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,  /* $ % & ( ) * + - . / */
/* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x40 */  5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,  /* @ A..O */
/* 0x50 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,  /* R and Z */
/* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x80 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x90 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xA0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xB0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xC0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xD0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xE0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xF0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
298
299#ifdef U_ENABLE_GENERIC_ISO_2022
300/*
301 * When the generic ISO-2022 converter is completely removed, not just disabled
302 * per #ifdef, then the following state table and the associated tables that are
303 * dimensioned with MAX_STATES_2022 should be trimmed.
304 *
305 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
306 * the associated escape sequences starting with ESC ( B should be removed.
307 * This includes the ones with key values 1097 and all of the ones above 1000000.
308 *
309 * For the latter, the tables can simply be truncated.
310 * For the former, since the tables must be kept parallel, it is probably best
311 * to simply duplicate an adjacent table cell, parallel in all tables.
312 *
313 * It may make sense to restructure the tables, especially by using small search
314 * tables for the variants instead of indexing them parallel to the table here.
315 */
316#endif
317
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape sequence prefixes, in ascending order.
 * The position of a key in this array is the index into the parallel
 * state tables. The rows below are grouped by the magnitude of the key
 * (number of 5-bit digits); the comment gives the index of the first key
 * of each row.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,

    /*  1 */ 34, 36, 39, 55, 57, 60, 61,

    /*  8 */ 1093, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104,
    /* 18 */ 1105, 1106, 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179,
    /* 28 */ 1254, 1257, 1768, 1773, 1957,

    /* 33 */ 35105, 36933, 36936, 36937, 36938, 36939, 36940, 36942, 36943, 36944,
    /* 43 */ 36945, 36946, 36947, 36948, 37640, 37642, 37644, 37646, 37711, 37744,
    /* 53 */ 37745, 37746, 37747, 37748, 40133, 40136, 40138, 40139, 40140, 40141,

    /* 63 */ 1123363,

    /* 64 */ 35947624, 35947625, 35947626, 35947627, 35947629,
    /* 69 */ 35947630, 35947631, 35947635, 35947636, 35947638
};
331
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each escape sequence key, parallel to
 * escSeqStateTable_Key_2022[]; NULL where the key has no converter name.
 * Five entries per row; the row comment gives the index of the first entry.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL, NULL, NULL, NULL, NULL,
    /*  5 */ NULL, NULL, NULL, "latin1", "latin1",
    /* 10 */ "latin1", "ibm-865", "ibm-865", "ibm-865", "ibm-865",
    /* 15 */ "ibm-865", "ibm-865", "JISX0201", "JISX0201", "latin1",
    /* 20 */ "latin1", NULL, "JISX-208", "ibm-5478", "JISX-208",
    /* 25 */ NULL, NULL, NULL, NULL, "UTF8",
    /* 30 */ "ISO-8859-1", "ISO-8859-7", "JIS-X-208", NULL, "ibm-955",
    /* 35 */ "ibm-367", "ibm-952", "ibm-949", "JISX-212", "ibm-1383",
    /* 40 */ "ibm-952", "ibm-964", "ibm-964", "ibm-964", "ibm-964",
    /* 45 */ "ibm-964", "ibm-964", "ibm-5478", "ibm-949", "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1", "CNS-11643-1992,2", "CNS-11643-1992,3", "CNS-11643-1992,4", "CNS-11643-1992,5",
    /* 55 */ "CNS-11643-1992,6", "CNS-11643-1992,7", "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL, "latin1",
    /* 65 */ "ibm-912", "ibm-913", "ibm-914", "ibm-813", "ibm-1089",
    /* 70 */ "ibm-920", "ibm-915", "ibm-915", "latin1"
};

#endif
348
349static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
350/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
351     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
352    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
353    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
354    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
355    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
356    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
357    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
358    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
359};
360
361
/*
 * Identifies the ISO-2022 variant (type introduced when the
 * changeState_2022 code was refactored).
 * The generic variant only exists when U_ENABLE_GENERIC_ISO_2022 is defined.
 */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
371
372/*********** ISO 2022 Converter Protos ***********/
373static void
374_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
375
376static void
377 _ISO2022Close(UConverter *converter);
378
379static void
380_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
381
382static const char*
383_ISO2022getName(const UConverter* cnv);
384
385static void
386_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
387
388static UConverter *
389_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
390
391#ifdef U_ENABLE_GENERIC_ISO_2022
392static void
393T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
394#endif
395
396/*const UConverterSharedData _ISO2022Data;*/
397static const UConverterSharedData _ISO2022JPData;
398static const UConverterSharedData _ISO2022KRData;
399static const UConverterSharedData _ISO2022CNData;
400
401/*************** Converter implementations ******************/
402
403/* The purpose of this function is to get around gcc compiler warnings. */
404static U_INLINE void
405fromUWriteUInt8(UConverter *cnv,
406                 const char *bytes, int32_t length,
407                 uint8_t **target, const char *targetLimit,
408                 int32_t **offsets,
409                 int32_t sourceIndex,
410                 UErrorCode *pErrorCode)
411{
412    char *targetChars = (char *)*target;
413    ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
414                         offsets, sourceIndex, pErrorCode);
415    *target = (uint8_t*)targetChars;
416
417}
418
419static U_INLINE void
420setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
421    if(myConverterData->version == 1) {
422        UConverter *cnv = myConverterData->currentConverter;
423
424        cnv->toUnicodeStatus=0;     /* offset */
425        cnv->mode=0;                /* state */
426        cnv->toULength=0;           /* byteIndex */
427    }
428}
429
430static U_INLINE void
431setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
432   /* in ISO-2022-KR the designator sequence appears only once
433    * in a file so we append it only once
434    */
435    if( converter->charErrorBufferLength==0){
436
437        converter->charErrorBufferLength = 4;
438        converter->charErrorBuffer[0] = 0x1b;
439        converter->charErrorBuffer[1] = 0x24;
440        converter->charErrorBuffer[2] = 0x29;
441        converter->charErrorBuffer[3] = 0x43;
442    }
443    if(myConverterData->version == 1) {
444        UConverter *cnv = myConverterData->currentConverter;
445
446        cnv->fromUChar32=0;
447        cnv->fromUnicodeStatus=1;   /* prevLength */
448    }
449}
450
451static void
452_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
453
454    char myLocale[6]={' ',' ',' ',' ',' ',' '};
455
456    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
457    if(cnv->extraInfo != NULL) {
458        UConverterNamePieces stackPieces;
459        UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) };
460        UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
461        uint32_t version;
462
463        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
464
465        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
466        myConverterData->currentType = ASCII1;
467        cnv->fromUnicodeStatus =FALSE;
468        if(pArgs->locale){
469            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
470        }
471        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
472        myConverterData->version = version;
473        if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
474            (myLocale[2]=='_' || myLocale[2]=='\0'))
475        {
476            size_t len=0;
477            /* open the required converters and cache them */
478            if(version>MAX_JA_VERSION) {
479                /* prevent indexing beyond jpCharsetMasks[] */
480                myConverterData->version = version = 0;
481            }
482            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
483                myConverterData->myConverterArray[ISO8859_7] =
484                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
485            }
486            myConverterData->myConverterArray[JISX208] =
487                ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
488            if(jpCharsetMasks[version]&CSM(JISX212)) {
489                myConverterData->myConverterArray[JISX212] =
490                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
491            }
492            if(jpCharsetMasks[version]&CSM(GB2312)) {
493                myConverterData->myConverterArray[GB2312] =
494                    ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
495            }
496            if(jpCharsetMasks[version]&CSM(KSC5601)) {
497                myConverterData->myConverterArray[KSC5601] =
498                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
499            }
500
501            /* set the function pointers to appropriate funtions */
502            cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
503            uprv_strcpy(myConverterData->locale,"ja");
504
505            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
506            len = uprv_strlen(myConverterData->name);
507            myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
508            myConverterData->name[len+1]='\0';
509        }
510        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
511            (myLocale[2]=='_' || myLocale[2]=='\0'))
512        {
513            const char *cnvName;
514            if(version==1) {
515                cnvName="icu-internal-25546";
516            } else {
517                cnvName="ksc_5601";
518                myConverterData->version=version=0;
519            }
520            if(pArgs->onlyTestIsLoadable) {
521                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
522                uprv_free(cnv->extraInfo);
523                cnv->extraInfo=NULL;
524                return;
525            } else {
526                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
527                if (U_FAILURE(*errorCode)) {
528                    _ISO2022Close(cnv);
529                    return;
530                }
531
532                if(version==1) {
533                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
534                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
535                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
536                }else{
537                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
538                }
539
540                /* initialize the state variables */
541                setInitialStateToUnicodeKR(cnv, myConverterData);
542                setInitialStateFromUnicodeKR(cnv, myConverterData);
543
544                /* set the function pointers to appropriate funtions */
545                cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
546                uprv_strcpy(myConverterData->locale,"ko");
547            }
548        }
549        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
550            (myLocale[2]=='_' || myLocale[2]=='\0'))
551        {
552
553            /* open the required converters and cache them */
554            myConverterData->myConverterArray[GB2312_1] =
555                ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
556            if(version==1) {
557                myConverterData->myConverterArray[ISO_IR_165] =
558                    ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
559            }
560            myConverterData->myConverterArray[CNS_11643] =
561                ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
562
563
564            /* set the function pointers to appropriate funtions */
565            cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
566            uprv_strcpy(myConverterData->locale,"cn");
567
568            if (version==0){
569                myConverterData->version = 0;
570                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
571            }else if (version==1){
572                myConverterData->version = 1;
573                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
574            }else {
575                myConverterData->version = 2;
576                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
577            }
578        }
579        else{
580#ifdef U_ENABLE_GENERIC_ISO_2022
581            myConverterData->isFirstBuffer = TRUE;
582
583            /* append the UTF-8 escape sequence */
584            cnv->charErrorBufferLength = 3;
585            cnv->charErrorBuffer[0] = 0x1b;
586            cnv->charErrorBuffer[1] = 0x25;
587            cnv->charErrorBuffer[2] = 0x42;
588
589            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
590            /* initialize the state variables */
591            uprv_strcpy(myConverterData->name,"ISO_2022");
592#else
593            *errorCode = U_UNSUPPORTED_ERROR;
594            return;
595#endif
596        }
597
598        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
599
600        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
601            _ISO2022Close(cnv);
602        }
603    } else {
604        *errorCode = U_MEMORY_ALLOCATION_ERROR;
605    }
606}
607
608
609static void
610_ISO2022Close(UConverter *converter) {
611    UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
612    UConverterSharedData **array = myData->myConverterArray;
613    int32_t i;
614
615    if (converter->extraInfo != NULL) {
616        /*close the array of converter pointers and free the memory*/
617        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
618            if(array[i]!=NULL) {
619                ucnv_unloadSharedDataIfReady(array[i]);
620            }
621        }
622
623        ucnv_close(myData->currentConverter);
624
625        if(!converter->isExtraLocal){
626            uprv_free (converter->extraInfo);
627            converter->extraInfo = NULL;
628        }
629    }
630}
631
632static void
633_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
634    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
635    if(choice<=UCNV_RESET_TO_UNICODE) {
636        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
637        myConverterData->key = 0;
638        myConverterData->isEmptySegment = FALSE;
639    }
640    if(choice!=UCNV_RESET_TO_UNICODE) {
641        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
642    }
643#ifdef U_ENABLE_GENERIC_ISO_2022
644    if(myConverterData->locale[0] == 0){
645        if(choice<=UCNV_RESET_TO_UNICODE) {
646            myConverterData->isFirstBuffer = TRUE;
647            myConverterData->key = 0;
648            if (converter->mode == UCNV_SO){
649                ucnv_close (myConverterData->currentConverter);
650                myConverterData->currentConverter=NULL;
651            }
652            converter->mode = UCNV_SI;
653        }
654        if(choice!=UCNV_RESET_TO_UNICODE) {
655            /* re-append UTF-8 escape sequence */
656            converter->charErrorBufferLength = 3;
657            converter->charErrorBuffer[0] = 0x1b;
658            converter->charErrorBuffer[1] = 0x28;
659            converter->charErrorBuffer[2] = 0x42;
660        }
661    }
662    else
663#endif
664    {
665        /* reset the state variables */
666        if(myConverterData->locale[0] == 'k'){
667            if(choice<=UCNV_RESET_TO_UNICODE) {
668                setInitialStateToUnicodeKR(converter, myConverterData);
669            }
670            if(choice!=UCNV_RESET_TO_UNICODE) {
671                setInitialStateFromUnicodeKR(converter, myConverterData);
672            }
673        }
674    }
675}
676
677static const char*
678_ISO2022getName(const UConverter* cnv){
679    if(cnv->extraInfo){
680        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
681        return myData->name;
682    }
683    return NULL;
684}
685
686
687/*************** to unicode *******************/
688/****************************************************************************
689 * Recognized escape sequences are
690 * <ESC>(B  ASCII
691 * <ESC>.A  ISO-8859-1
692 * <ESC>.F  ISO-8859-7
693 * <ESC>(J  JISX-201
694 * <ESC>(I  JISX-201
695 * <ESC>$B  JISX-208
696 * <ESC>$@  JISX-208
697 * <ESC>$(D JISX-212
698 * <ESC>$A  GB2312
699 * <ESC>$(C KSC5601
700 */
701static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
702/*      0                1               2               3               4               5               6               7               8               9    */
703    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
704    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
705    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
706    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
707    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
708    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
709    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
710    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
711};
712
713/*************** to unicode *******************/
714static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
715/*      0                1               2               3               4               5               6               7               8               9    */
716     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
717    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
718    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
719    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
720    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
721    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
722    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
723    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
724};
725
726
727static UCNV_TableStates_2022
728getKey_2022(char c,int32_t* key,int32_t* offset){
729    int32_t togo;
730    int32_t low = 0;
731    int32_t hi = MAX_STATES_2022;
732    int32_t oldmid=0;
733
734    togo = normalize_esq_chars_2022[(uint8_t)c];
735    if(togo == 0) {
736        /* not a valid character anywhere in an escape sequence */
737        *key = 0;
738        *offset = 0;
739        return INVALID_2022;
740    }
741    togo = (*key << 5) + togo;
742
743    while (hi != low)  /*binary search*/{
744
745        register int32_t mid = (hi+low) >> 1; /*Finds median*/
746
747        if (mid == oldmid)
748            break;
749
750        if (escSeqStateTable_Key_2022[mid] > togo){
751            hi = mid;
752        }
753        else if (escSeqStateTable_Key_2022[mid] < togo){
754            low = mid;
755        }
756        else /*we found it*/{
757            *key = togo;
758            *offset = mid;
759            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
760        }
761        oldmid = mid;
762
763    }
764
765    *key = 0;
766    *offset = 0;
767    return INVALID_2022;
768}
769
770/*runs through a state machine to determine the escape sequence - codepage correspondance
771 */
772static void
773changeState_2022(UConverter* _this,
774                const char** source,
775                const char* sourceLimit,
776                Variant2022 var,
777                UErrorCode* err){
778    UCNV_TableStates_2022 value;
779    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
780    uint32_t key = myData2022->key;
781    int32_t offset = 0;
782    int8_t initialToULength = _this->toULength;
783    char c;
784
785    value = VALID_NON_TERMINAL_2022;
786    while (*source < sourceLimit) {
787        c = *(*source)++;
788        _this->toUBytes[_this->toULength++]=(uint8_t)c;
789        value = getKey_2022(c,(int32_t *) &key, &offset);
790
791        switch (value){
792
793        case VALID_NON_TERMINAL_2022 :
794            /* continue with the loop */
795            break;
796
797        case VALID_TERMINAL_2022:
798            key = 0;
799            goto DONE;
800
801        case INVALID_2022:
802            goto DONE;
803
804        case VALID_MAYBE_TERMINAL_2022:
805#ifdef U_ENABLE_GENERIC_ISO_2022
806            /* ESC ( B is ambiguous only for ISO_2022 itself */
807            if(var == ISO_2022) {
808                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
809                _this->toULength = 0;
810
811                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
812
813                /* continue with the loop */
814                value = VALID_NON_TERMINAL_2022;
815                break;
816            } else
817#endif
818            {
819                /* not ISO_2022 itself, finish here */
820                value = VALID_TERMINAL_2022;
821                key = 0;
822                goto DONE;
823            }
824        }
825    }
826
827DONE:
828    myData2022->key = key;
829
830    if (value == VALID_NON_TERMINAL_2022) {
831        /* indicate that the escape sequence is incomplete: key!=0 */
832        return;
833    } else if (value == INVALID_2022 ) {
834        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
835    } else /* value == VALID_TERMINAL_2022 */ {
836        switch(var){
837#ifdef U_ENABLE_GENERIC_ISO_2022
838        case ISO_2022:
839        {
840            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
841            if(chosenConverterName == NULL) {
842                /* SS2 or SS3 */
843                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
844                _this->toUCallbackReason = UCNV_UNASSIGNED;
845                return;
846            }
847
848            _this->mode = UCNV_SI;
849            ucnv_close(myData2022->currentConverter);
850            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
851            if(U_SUCCESS(*err)) {
852                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
853                _this->mode = UCNV_SO;
854            }
855            break;
856        }
857#endif
858        case ISO_2022_JP:
859            {
860                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
861                switch(tempState) {
862                case INVALID_STATE:
863                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
864                    break;
865                case SS2_STATE:
866                    if(myData2022->toU2022State.cs[2]!=0) {
867                        if(myData2022->toU2022State.g<2) {
868                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
869                        }
870                        myData2022->toU2022State.g=2;
871                    } else {
872                        /* illegal to have SS2 before a matching designator */
873                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
874                    }
875                    break;
876                /* case SS3_STATE: not used in ISO-2022-JP-x */
877                case ISO8859_1:
878                case ISO8859_7:
879                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
880                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
881                    } else {
882                        /* G2 charset for SS2 */
883                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
884                    }
885                    break;
886                default:
887                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
888                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
889                    } else {
890                        /* G0 charset */
891                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
892                    }
893                    break;
894                }
895            }
896            break;
897        case ISO_2022_CN:
898            {
899                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
900                switch(tempState) {
901                case INVALID_STATE:
902                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
903                    break;
904                case SS2_STATE:
905                    if(myData2022->toU2022State.cs[2]!=0) {
906                        if(myData2022->toU2022State.g<2) {
907                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
908                        }
909                        myData2022->toU2022State.g=2;
910                    } else {
911                        /* illegal to have SS2 before a matching designator */
912                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
913                    }
914                    break;
915                case SS3_STATE:
916                    if(myData2022->toU2022State.cs[3]!=0) {
917                        if(myData2022->toU2022State.g<2) {
918                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
919                        }
920                        myData2022->toU2022State.g=3;
921                    } else {
922                        /* illegal to have SS3 before a matching designator */
923                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
924                    }
925                    break;
926                case ISO_IR_165:
927                    if(myData2022->version==0) {
928                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
929                        break;
930                    }
931                    /*fall through*/
932                case GB2312_1:
933                    /*fall through*/
934                case CNS_11643_1:
935                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
936                    break;
937                case CNS_11643_2:
938                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
939                    break;
940                default:
941                    /* other CNS 11643 planes */
942                    if(myData2022->version==0) {
943                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
944                    } else {
945                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
946                    }
947                    break;
948                }
949            }
950            break;
951        case ISO_2022_KR:
952            if(offset==0x30){
953                /* nothing to be done, just accept this one escape sequence */
954            } else {
955                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
956            }
957            break;
958
959        default:
960            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
961            break;
962        }
963    }
964    if(U_SUCCESS(*err)) {
965        _this->toULength = 0;
966    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
967        if(_this->toULength>1) {
968            /*
969             * Ticket 5691: consistent illegal sequences:
970             * - We include at least the first byte (ESC) in the illegal sequence.
971             * - If any of the non-initial bytes could be the start of a character,
972             *   we stop the illegal sequence before the first one of those.
973             *   In escape sequences, all following bytes are "printable", that is,
974             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
975             *   they are valid single/lead bytes.
976             *   For simplicity, we always only report the initial ESC byte as the
977             *   illegal sequence and back out all other bytes we looked at.
978             */
979            /* Back out some bytes. */
980            int8_t backOutDistance=_this->toULength-1;
981            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
982            if(backOutDistance<=bytesFromThisBuffer) {
983                /* same as initialToULength<=1 */
984                *source-=backOutDistance;
985            } else {
986                /* Back out bytes from the previous buffer: Need to replay them. */
987                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
988                /* same as -(initialToULength-1) */
989                /* preToULength is negative! */
990                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
991                *source-=bytesFromThisBuffer;
992            }
993            _this->toULength=1;
994        }
995    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
996        _this->toUCallbackReason = UCNV_UNASSIGNED;
997    }
998}
999
1000/*Checks the characters of the buffer against valid 2022 escape sequences
1001*if the match we return a pointer to the initial start of the sequence otherwise
1002*we return sourceLimit
1003*/
1004/*for 2022 looks ahead in the stream
1005 *to determine the longest possible convertible
1006 *data stream
1007 */
1008static U_INLINE const char*
1009getEndOfBuffer_2022(const char** source,
1010                   const char* sourceLimit,
1011                   UBool flush){
1012
1013    const char* mySource = *source;
1014
1015#ifdef U_ENABLE_GENERIC_ISO_2022
1016    if (*source >= sourceLimit)
1017        return sourceLimit;
1018
1019    do{
1020
1021        if (*mySource == ESC_2022){
1022            int8_t i;
1023            int32_t key = 0;
1024            int32_t offset;
1025            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1026
1027            /* Kludge: I could not
1028            * figure out the reason for validating an escape sequence
1029            * twice - once here and once in changeState_2022().
1030            * is it possible to have an ESC character in a ISO2022
1031            * byte stream which is valid in a code page? Is it legal?
1032            */
1033            for (i=0;
1034            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1035            i++) {
1036                value =  getKey_2022(*(mySource+i), &key, &offset);
1037            }
1038            if (value > 0 || *mySource==ESC_2022)
1039                return mySource;
1040
1041            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1042                return sourceLimit;
1043        }
1044    }while (++mySource < sourceLimit);
1045
1046    return sourceLimit;
1047#else
1048    while(mySource < sourceLimit && *mySource != ESC_2022) {
1049        ++mySource;
1050    }
1051    return mySource;
1052#endif
1053}
1054
1055
1056/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1057 * any future change in _MBCSFromUChar32() function should be reflected here.
1058 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1059 */
1060static U_INLINE int32_t
1061MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1062                                         UChar32 c,
1063                                         uint32_t* value,
1064                                         UBool useFallback,
1065                                         int outputType)
1066{
1067    const int32_t *cx;
1068    const uint16_t *table;
1069    uint32_t stage2Entry;
1070    uint32_t myValue;
1071    int32_t length;
1072    const uint8_t *p;
1073    /*
1074     * TODO(markus): Use and require new, faster MBCS conversion table structures.
1075     * Use internal version of ucnv_open() that verifies that the new structures are available,
1076     * else U_INTERNAL_PROGRAM_ERROR.
1077     */
1078    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1079    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1080        table=sharedData->mbcs.fromUnicodeTable;
1081        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1082        /* get the bytes and the length for the output */
1083        if(outputType==MBCS_OUTPUT_2){
1084            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1085            if(myValue<=0xff) {
1086                length=1;
1087            } else {
1088                length=2;
1089            }
1090        } else /* outputType==MBCS_OUTPUT_3 */ {
1091            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1092            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1093            if(myValue<=0xff) {
1094                length=1;
1095            } else if(myValue<=0xffff) {
1096                length=2;
1097            } else {
1098                length=3;
1099            }
1100        }
1101        /* is this code point assigned, or do we use fallbacks? */
1102        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1103            /* assigned */
1104            *value=myValue;
1105            return length;
1106        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1107            /*
1108             * We allow a 0 byte output if the "assigned" bit is set for this entry.
1109             * There is no way with this data structure for fallback output
1110             * to be a zero byte.
1111             */
1112            *value=myValue;
1113            return -length;
1114        }
1115    }
1116
1117    cx=sharedData->mbcs.extIndexes;
1118    if(cx!=NULL) {
1119        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1120    }
1121
1122    /* unassigned */
1123    return 0;
1124}
1125
1126/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1127 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1128 * @param retval pointer to output byte
1129 * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1130 */
1131static U_INLINE int32_t
1132MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1133                                       UChar32 c,
1134                                       uint32_t* retval,
1135                                       UBool useFallback)
1136{
1137    const uint16_t *table;
1138    int32_t value;
1139    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1140    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1141        return 0;
1142    }
1143    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1144    table=sharedData->mbcs.fromUnicodeTable;
1145    /* get the byte for the output */
1146    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1147    /* is this code point assigned, or do we use fallbacks? */
1148    *retval=(uint32_t)(value&0xff);
1149    if(value>=0xf00) {
1150        return 1;  /* roundtrip */
1151    } else if(useFallback ? value>=0x800 : value>=0xc00) {
1152        return -1;  /* fallback taken */
1153    } else {
1154        return 0;  /* no mapping */
1155    }
1156}
1157
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 *
 * The lead byte is taken as everything above the low 8 bits, so a value with
 * more than two bytes (above 0xFFFF) is rejected as well instead of being
 * silently truncated before the range check.
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint32_t lead = value >> 8;
    const uint32_t trail = value & 0xff;

    if( 0xa1 <= lead && lead <= 0xfe &&
        0xa1 <= trail && trail <= 0xfe
    ) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    } else {
        return 0;  /* not valid for ISO 2022 */
    }
}
1174
1175#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of the ISO 2022
 * code point. The shifted value is returned if both of its low bytes are in
 * the range A1..FE; otherwise the input is returned unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;

    /* low 16 bits must lie in a1a1..fefe */
    if((uint16_t)(shifted - 0xa1a1) > (0xfefe - 0xa1a1)) {
        return value;
    }
    /* and the trail byte in a1..fe */
    if((uint8_t)(shifted - 0xa1) > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1191#endif
1192
1193#ifdef U_ENABLE_GENERIC_ISO_2022
1194
1195/**********************************************************************************
1196*  ISO-2022 Converter
1197*
1198*
1199*/
1200
1201static void
1202T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1203                                                           UErrorCode* err){
1204    const char* mySourceLimit, *realSourceLimit;
1205    const char* sourceStart;
1206    const UChar* myTargetStart;
1207    UConverter* saveThis;
1208    UConverterDataISO2022* myData;
1209    int8_t length;
1210
1211    saveThis = args->converter;
1212    myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1213
1214    realSourceLimit = args->sourceLimit;
1215    while (args->source < realSourceLimit) {
1216        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1217            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1218            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1219
1220            if(args->source < mySourceLimit) {
1221                if(myData->currentConverter==NULL) {
1222                    myData->currentConverter = ucnv_open("ASCII",err);
1223                    if(U_FAILURE(*err)){
1224                        return;
1225                    }
1226
1227                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1228                    saveThis->mode = UCNV_SO;
1229                }
1230
1231                /* convert to before the ESC or until the end of the buffer */
1232                myData->isFirstBuffer=FALSE;
1233                sourceStart = args->source;
1234                myTargetStart = args->target;
1235                args->converter = myData->currentConverter;
1236                ucnv_toUnicode(args->converter,
1237                    &args->target,
1238                    args->targetLimit,
1239                    &args->source,
1240                    mySourceLimit,
1241                    args->offsets,
1242                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
1243                    err);
1244                args->converter = saveThis;
1245
1246                if (*err == U_BUFFER_OVERFLOW_ERROR) {
1247                    /* move the overflow buffer */
1248                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1249                    myData->currentConverter->UCharErrorBufferLength = 0;
1250                    if(length > 0) {
1251                        uprv_memcpy(saveThis->UCharErrorBuffer,
1252                                    myData->currentConverter->UCharErrorBuffer,
1253                                    length*U_SIZEOF_UCHAR);
1254                    }
1255                    return;
1256                }
1257
1258                /*
1259                 * At least one of:
1260                 * -Error while converting
1261                 * -Done with entire buffer
1262                 * -Need to write offsets or update the current offset
1263                 *  (leave that up to the code in ucnv.c)
1264                 *
1265                 * or else we just stopped at an ESC byte and continue with changeState_2022()
1266                 */
1267                if (U_FAILURE(*err) ||
1268                    (args->source == realSourceLimit) ||
1269                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1270                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1271                ) {
1272                    /* copy partial or error input for truncated detection and error handling */
1273                    if(U_FAILURE(*err)) {
1274                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1275                        if(length > 0) {
1276                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1277                        }
1278                    } else {
1279                        length = saveThis->toULength = myData->currentConverter->toULength;
1280                        if(length > 0) {
1281                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1282                            if(args->source < mySourceLimit) {
1283                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1284                            }
1285                        }
1286                    }
1287                    return;
1288                }
1289            }
1290        }
1291
1292        sourceStart = args->source;
1293        changeState_2022(args->converter,
1294               &(args->source),
1295               realSourceLimit,
1296               ISO_2022,
1297               err);
1298        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1299            /* let the ucnv.c code update its current offset */
1300            return;
1301        }
1302    }
1303}
1304
1305#endif
1306
1307/*
1308 * To Unicode Callback helper function
1309 */
1310static void
1311toUnicodeCallback(UConverter *cnv,
1312                  const uint32_t sourceChar, const uint32_t targetUniChar,
1313                  UErrorCode* err){
1314    if(sourceChar>0xff){
1315        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1316        cnv->toUBytes[1] = (uint8_t)sourceChar;
1317        cnv->toULength = 2;
1318    }
1319    else{
1320        cnv->toUBytes[0] =(char) sourceChar;
1321        cnv->toULength = 1;
1322    }
1323
1324    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1325        *err = U_INVALID_CHAR_FOUND;
1326    }
1327    else{
1328        *err = U_ILLEGAL_CHAR_FOUND;
1329    }
1330}
1331
1332/**************************************ISO-2022-JP*************************************************/
1333
1334/************************************** IMPORTANT **************************************************
1335* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1336* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1337* The converter iterates over each Unicode codepoint
1338* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1339* processed one char at a time it would make sense to reduce the extra processing a canned converter
1340* would do as far as possible.
1341*
1342* If the implementation of these macros or structure of sharedData struct change in the future, make
1343* sure that ISO-2022 is also changed.
1344***************************************************************************************************
1345*/
1346
1347/***************************************************************************************************
1348* Rules for ISO-2022-jp encoding
1349* (i)   Escape sequences must be fully contained within a line they should not
1350*       span new lines or CRs
1351* (ii)  If the last character on a line is represented by two bytes then an ASCII or
1352*       JIS-Roman character escape sequence should follow before the line terminates
1353* (iii) If the first character on the line is represented by two bytes then a two
1354*       byte character escape sequence should precede it
1355* (iv)  If no escape sequence is encountered then the characters are ASCII
1356* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1357*       and invoked with SS2 (ESC N).
1358* (vi)  If there is any G0 designation in text, there must be a switch to
1359*       ASCII or to JIS X 0201-Roman before a space character (but not
1360*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1361*       characters such as tab or CRLF.
* (vii) Supported encodings:
1363*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1364*
1365*  source : RFC-1554
1366*
1367*          JISX201, JISX208,JISX212 : new .cnv data files created
1368*          KSC5601 : alias to ibm-949 mapping table
1369*          GB2312 : alias to ibm-1386 mapping table
1370*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
1372*/
1373
1374/* preference order of JP charsets */
1375static const StateEnum jpCharsetPref[]={
1376    ASCII,
1377    JISX201,
1378    ISO8859_1,
1379    ISO8859_7,
1380    JISX208,
1381    JISX212,
1382    GB2312,
1383    KSC5601,
1384    HWKANA_7BIT
1385};
1386
/*
 * Escape sequence bytes for each JP charset, NUL-terminated.
 * The rows must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    /* ASCII        <ESC>(B  */ "\x1B\x28\x42",
    /* ISO-8859-1   <ESC>.A  */ "\x1B\x2E\x41",
    /* ISO-8859-7   <ESC>.F  */ "\x1B\x2E\x46",
    /* JISX-201     <ESC>(J  */ "\x1B\x28\x4A",
    /* JISX-208     <ESC>$B  */ "\x1B\x24\x42",
    /* JISX-212     <ESC>$(D */ "\x1B\x24\x28\x44",
    /* GB2312       <ESC>$A  */ "\x1B\x24\x41",
    /* KSC5601      <ESC>$(C */ "\x1B\x24\x28\x43",
    /* HWKANA_7BIT  <ESC>(I  */ "\x1B\x28\x49"
};
/* Number of bytes in each escSeqChars[] entry, in the same row order. */
static  const int8_t escSeqCharsLen[] ={
    /* ASCII        <ESC>(B  */ 3,
    /* ISO-8859-1   <ESC>.A  */ 3,
    /* ISO-8859-7   <ESC>.F  */ 3,
    /* JISX-201     <ESC>(J  */ 3,
    /* JISX-208     <ESC>$B  */ 3,
    /* JISX-212     <ESC>$(D */ 4,
    /* GB2312       <ESC>$A  */ 3,
    /* KSC5601      <ESC>$(C */ 4,
    /* HWKANA_7BIT  <ESC>(I  */ 3
};
1414
1415/*
1416* The iteration over various code pages works this way:
1417* i)   Get the currentState from myConverterData->currentState
1418* ii)  Check if the character is mapped to a valid character in the currentState
1419*      Yes ->  a) set the initIterState to currentState
1420*       b) remain in this state until an invalid character is found
1421*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if the current state
*      is equal to the initIterState
1424*      Yes ->  A character that cannot be represented in any of the supported encodings
1425*       break and return a U_INVALID_CHARACTER error
1426*      No  ->  Continue and find the character in next code page
1427*
1428*
1429* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1430*/
1431
/*
 * Map a JIS X 0201 Roman byte (00..7F) to Unicode.
 * Only two positions differ from ASCII: 5C is the yen sign (U+00A5)
 * and 7E is the overline (U+203E). Every other value is returned unchanged.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1445
/*
 * Map a Unicode code point to a JIS X 0201 Roman byte (00..7F).
 * U+00A5 and U+203E map to 5C and 7E; the ASCII code points 5C and 7E
 * themselves have no mapping because those byte positions are taken.
 * Returns U+FFFE for anything that cannot be mapped.
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1460
/*
 * Convert a valid Shift-JIS double-byte code to the corresponding pair of
 * JIS X 0208 bytes (each in 21..7E), packed as (lead << 8) | trail.
 * Returns 0 when the code lies beyond the JIS X 0208 range (above EFFC).
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    const uint8_t sjisTrail = (uint8_t)value;
    uint32_t jisLead;
    int32_t jisTrail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    /* Each Shift-JIS lead byte covers two JIS rows: rebase the lead byte, then double it. */
    jisLead = value & 0xff00;
    jisLead -= (jisLead <= 0x9f00) ? 0x7000 : 0xb000;  /* 81..9F vs. E0..EF lead bytes */
    jisLead <<= 1;

    if(sjisTrail > 0x9e) {
        /* upper half of the trail range: second row of the pair */
        jisTrail = sjisTrail - 0x7e;
    } else {
        /* lower half: first row of the pair; the trail range skips 7F */
        jisLead -= 0x100;
        jisTrail = sjisTrail - ((sjisTrail <= 0x7e) ? 0x1f : 0x20);
    }
    return jisLead | (uint32_t)jisTrail;
}
1496
/*
 * Convert a JIS X 0208 byte pair (each byte expected in 21..7E) to Shift-JIS
 * and store the two result bytes in bytes[0] and bytes[1].
 * Input bytes outside 21..7E must not yield a valid Shift-JIS pair, so that
 * the converter using the result rejects it. Out-of-range values are either
 * replaced with 0 here or already land on Shift-JIS byte values that are
 * invalid anyway, which is why not every case is tested explicitly.
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t sjisLead, sjisTrail;

    if((c1 & 1) == 0) {
        /* even JIS row: upper half of the Shift-JIS trail byte range */
        sjisLead = c1;
        if((uint8_t)(c2 - 0x21) <= ((0x7e) - 0x21)) {
            sjisTrail = (uint8_t)(c2 + 0x7e);
        } else {
            sjisTrail = 0;  /* invalid */
        }
    } else {
        /* odd JIS row: lower half of the trail byte range, which skips 7F */
        sjisLead = (uint8_t)(c1 + 1);
        if(c2 > 0x7e) {
            sjisTrail = 0;  /* invalid */
        } else if(c2 > 0x5f) {
            sjisTrail = (uint8_t)(c2 + 0x20);
        } else {
            sjisTrail = (uint8_t)(c2 + 0x1f);
        }
    }

    /* two JIS rows share one Shift-JIS lead byte */
    sjisLead >>= 1;
    if(sjisLead > 0x3f) {
        sjisLead = 0;  /* invalid */
    } else if(sjisLead > 0x2f) {
        sjisLead = (uint8_t)(sjisLead + 0xb0);
    } else {
        sjisLead = (uint8_t)(sjisLead + 0x70);
    }

    bytes[0] = (char)sjisLead;
    bytes[1] = (char)sjisTrail;
}
1533
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table is indexed by (code point - HWKANA_START); each entry is the
 * two-byte JIS X 0208 code that the fromUnicode function emits as a
 * fallback (lead byte in the upper 8 bits, trail byte in the lower 8 bits).
 * Per the entry comments below, the first entry is for U+FF61 and the last
 * one for U+FF9F.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C,  /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F,  /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F,  /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C   /* U+FF9F */
};
1606
/*
 * Convert UTF-16 text to ISO-2022-JP (and its variants, selected by
 * converterData->version), optionally writing per-byte source offsets.
 *
 * Input:  args->source .. args->sourceLimit (UTF-16 code units)
 * Output: args->target .. args->targetLimit (bytes), and args->offsets if not NULL
 *
 * For each code point the function walks a list of candidate charsets
 * (choices[]), takes the first roundtrip mapping, or else the first fallback
 * mapping found, and writes whatever SI/SO, designation escape sequence and
 * single-shift are needed before the character bytes.
 * The designation/shift state is kept in converterData->fromU2022State.
 *
 * A lead surrogate at the end of the input is saved in cnv->fromUChar32 and
 * picked up again on the next call.
 *
 * On error the loop stops and *err is set to:
 *   U_ILLEGAL_CHAR_FOUND    - unmatched surrogate, or a character for which
 *                             IS_2022_CONTROL() is true (SO/SI/ESC)
 *   U_INVALID_CHAR_FOUND    - no candidate charset can represent the code point
 *   U_BUFFER_OVERFLOW_ERROR - the target buffer is full
 * For the first two, the offending code point is stored in cnv->fromUChar32.
 *
 * When args->flush is set and all input was consumed without error, the
 * output is returned to G0/ASCII (SI and/or <ESC>(B as needed).
 *
 * args->source and args->target are updated before returning.
 */
static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    /*
     * Staging area for the bytes of one code point: optional SI, optional
     * designation sequence (up to 4 bytes), optional SO or ESC N, and 1 or 2
     * character bytes.
     */
    char buffer[8];
    int32_t len, outLen;
    /* charsets to try for the next code point, in order; rebuilt when choiceCount==0 */
    int8_t choices[10];
    int32_t choiceCount;
    uint32_t targetValue = 0;
    UBool useFallback;

    int32_t i;
    /* cs: charset chosen for the current code point; g: the G0/G1/G2 set it is output through */
    int8_t cs, g;

    /* set up the state */
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State   = &converterData->fromU2022State;

    /* force choices[] to be built for the first code point */
    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        goto getTrail;
    }

    while(source < sourceLimit) {
        if(target < targetLimit) {

            sourceChar  = *(source++);
            /*check if the char is a First surrogate*/
            if(UTF_IS_SURROGATE(sourceChar)) {
                if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(UTF_IS_SECOND_SURROGATE(trail)) {
                            source++;
                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                            cnv->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            cnv->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input */
                        cnv->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            /* do the conversion */

            /* (re)build the list of candidate charsets if it was invalidated */
            if(choiceCount == 0) {
                uint16_t csm;

                /*
                 * The csm variable keeps track of which charsets are allowed
                 * and not used yet while building the choices[].
                 */
                csm = jpCharsetMasks[converterData->version];
                choiceCount = 0;

                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
                if(converterData->version == 3 || converterData->version == 4) {
                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
                }
                /* Do not try single-byte half-width Katakana for other versions. */
                csm &= ~CSM(HWKANA_7BIT);

                /* try the current G0 charset */
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
                csm &= ~CSM(cs);

                /* try the current G2 charset */
                if((cs = pFromU2022State->cs[2]) != 0) {
                    choices[choiceCount++] = cs;
                    csm &= ~CSM(cs);
                }

                /* try all the other possible charsets */
                for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
                    cs = (int8_t)jpCharsetPref[i];
                    if(CSM(cs) & csm) {
                        choices[choiceCount++] = cs;
                        csm &= ~CSM(cs);
                    }
                }
            }

            cs = g = 0;
            /*
             * len==0: no mapping found yet
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
             * len>0: found a roundtrip result, done
             */
            len = 0;
            /*
             * We will turn off useFallback after finding a fallback,
             * but we still get fallbacks from PUA code points as usual.
             * Therefore, we will also need to check that we don't overwrite
             * an early fallback with a later one.
             */
            useFallback = cnv->useFallback;

            /* try the candidates in order until a roundtrip mapping (len>0) is found */
            for(i = 0; i < choiceCount && len <= 0; ++i) {
                uint32_t value;
                int32_t len2;
                int8_t cs0 = choices[i];
                switch(cs0) {
                case ASCII:
                    if(sourceChar <= 0x7f) {
                        targetValue = (uint32_t)sourceChar;
                        len = 1;
                        cs = cs0;
                        g = 0;
                    }
                    break;
                case ISO8859_1:
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
                        /* output as a 7-bit byte through G2 */
                        targetValue = (uint32_t)sourceChar - 0x80;
                        len = 1;
                        cs = cs0;
                        g = 2;
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        if(converterData->version==3) {
                            /* JIS7: use G1 (SO) */
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
                            len = 1;
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
                            g = 1;
                        } else if(converterData->version==4) {
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
                            len = 1;

                            cs = pFromU2022State->cs[0];
                            if(IS_JP_DBCS(cs)) {
                                /* switch from a DBCS charset to JISX201 */
                                cs = (int8_t)JISX201;
                            }
                            /* else stay in the current G0 charset */
                            g = 0;
                        }
                        /* else do not use HWKANA_7BIT with other versions */
                    }
                    break;
                case JISX201:
                    /* G0 SBCS */
                    value = jisx201FromU(sourceChar);
                    if(value <= 0x7f) {
                        targetValue = value;
                        len = 1;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case JISX208:
                    /* G0 DBCS from Shift-JIS table */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        /* value==0 afterwards means the Shift-JIS code is outside JIS X 0208 */
                        value = _2022FromSJIS(value);
                        if(value != 0) {
                            targetValue = value;
                            len = len2;
                            cs = cs0;
                            g = 0;
                            useFallback = FALSE;
                        }
                    } else if(len == 0 && useFallback &&
                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        /* hardcoded fallback from half-width to full-width Katakana */
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
                        len = -2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case ISO8859_7:
                    /* G0 SBCS forced to 7-bit output */
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback);
                    /* accept a roundtrip, or a fallback if none was found earlier; only GR96 bytes */
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
                        targetValue = value - 0x80;
                        len = len2;
                        cs = cs0;
                        g = 2;
                        useFallback = FALSE;
                    }
                    break;
                default:
                    /* G0 DBCS */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        if(cs0 == KSC5601) {
                            /*
                             * Check for valid bytes for the encoding scheme.
                             * This is necessary because the sub-converter (windows-949)
                             * has a broader encoding scheme than is valid for 2022.
                             */
                            value = _2022FromGR94DBCS(value);
                            if(value == 0) {
                                break;
                            }
                        }
                        targetValue = value;
                        len = len2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                }
            }

            if(len != 0) {
                if(len < 0) {
                    len = -len;  /* fallback */
                }
                outLen = 0; /* count output bytes */

                /* write SI if necessary (only for JIS7) */
                if(pFromU2022State->g == 1 && g == 0) {
                    buffer[outLen++] = UCNV_SI;
                    pFromU2022State->g = 0;
                }

                /* write the designation sequence if necessary */
                if(cs != pFromU2022State->cs[g]) {
                    int32_t escLen = escSeqCharsLen[cs];
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
                    outLen += escLen;
                    pFromU2022State->cs[g] = cs;

                    /* invalidate the choices[] */
                    choiceCount = 0;
                }

                /* write the shift sequence if necessary */
                if(g != pFromU2022State->g) {
                    switch(g) {
                    /* case 0 handled before writing escapes */
                    case 1:
                        buffer[outLen++] = UCNV_SO;
                        pFromU2022State->g = 1;
                        break;
                    default: /* case 2 */
                        /* ESC N; the stored g state is left unchanged here */
                        buffer[outLen++] = 0x1b;
                        buffer[outLen++] = 0x4e;
                        break;
                    /* no case 3: no SS3 in ISO-2022-JP-x */
                    }
                }

                /* write the output bytes */
                if(len == 1) {
                    buffer[outLen++] = (char)targetValue;
                } else /* len == 2 */ {
                    buffer[outLen++] = (char)(targetValue >> 8);
                    buffer[outLen++] = (char)targetValue;
                }
            } else {
                /*
                 * if we cannot find the character after checking all codepages
                 * then this is an error
                 */
                *err = U_INVALID_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            if(sourceChar == CR || sourceChar == LF) {
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
                pFromU2022State->cs[2] = 0;
                choiceCount = 0;
            }

            /* output outLen>0 bytes in buffer[] */
            if(outLen == 1) {
                /* one byte always fits: target<targetLimit was checked at the top of the loop */
                *target++ = buffer[0];
                if(offsets) {
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
                }
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
                *target++ = buffer[0];
                *target++ = buffer[1];
                if(offsets) {
                    /* both bytes get the index of the first code unit of this code point */
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
                    *offsets++ = sourceIndex;
                    *offsets++ = sourceIndex;
                }
            } else {
                /*
                 * Longer output, or two bytes that do not both fit:
                 * let fromUWriteUInt8() write them; it may set *err
                 * (presumably on target overflow -- see its definition).
                 */
                fromUWriteUInt8(
                    cnv,
                    buffer, outLen,
                    &target, (const char *)targetLimit,
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
                    err);
                if(U_FAILURE(*err)) {
                    break;
                }
            }
        } /* end if(myTargetIndex<myTargetLength) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(mySourceIndex<mySourceLength) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-JP conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   in SO mode or not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
    ) {
        int32_t sourceIndex;

        outLen = 0;

        /* leave the shifted state first ... */
        if(pFromU2022State->g != 0) {
            buffer[outLen++] = UCNV_SI;
            pFromU2022State->g = 0;
        }

        /* ... then designate ASCII to G0 */
        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more if the last code unit looks like the trail of a surrogate pair */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            /* no input in this buffer: use -1 as the offset for the closing bytes */
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            cnv,
            buffer, outLen,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}
2021
2022/*************** to unicode *******************/
2023
2024static void
2025UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2026                                               UErrorCode* err){
2027    char tempBuf[2];
2028    const char *mySource = (char *) args->source;
2029    UChar *myTarget = args->target;
2030    const char *mySourceLimit = args->sourceLimit;
2031    uint32_t targetUniChar = 0x0000;
2032    uint32_t mySourceChar = 0x0000;
2033    uint32_t tmpSourceChar = 0x0000;
2034    UConverterDataISO2022* myData;
2035    ISO2022State *pToU2022State;
2036    StateEnum cs;
2037
2038    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2039    pToU2022State = &myData->toU2022State;
2040
2041    if(myData->key != 0) {
2042        /* continue with a partial escape sequence */
2043        goto escape;
2044    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2045        /* continue with a partial double-byte character */
2046        mySourceChar = args->converter->toUBytes[0];
2047        args->converter->toULength = 0;
2048        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2049        targetUniChar = missingCharMarker;
2050        goto getTrailByte;
2051    }
2052
2053    while(mySource < mySourceLimit){
2054
2055        targetUniChar =missingCharMarker;
2056
2057        if(myTarget < args->targetLimit){
2058
2059            mySourceChar= (unsigned char) *mySource++;
2060
2061            switch(mySourceChar) {
2062            case UCNV_SI:
2063                if(myData->version==3) {
2064                    pToU2022State->g=0;
2065                    continue;
2066                } else {
2067                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2068                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2069                    break;
2070                }
2071
2072            case UCNV_SO:
2073                if(myData->version==3) {
2074                    /* JIS7: switch to G1 half-width Katakana */
2075                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2076                    pToU2022State->g=1;
2077                    continue;
2078                } else {
2079                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2080                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2081                    break;
2082                }
2083
2084            case ESC_2022:
2085                mySource--;
2086escape:
2087                {
2088                    const char * mySourceBefore = mySource;
2089                    int8_t toULengthBefore = args->converter->toULength;
2090
2091                    changeState_2022(args->converter,&(mySource),
2092                        mySourceLimit, ISO_2022_JP,err);
2093
2094                    /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2095                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2096                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2097                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
2098                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2099                    }
2100                }
2101
2102                /* invalid or illegal escape sequence */
2103                if(U_FAILURE(*err)){
2104                    args->target = myTarget;
2105                    args->source = mySource;
2106                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
2107                    return;
2108                }
2109                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2110                if(myData->key==0) {
2111                    myData->isEmptySegment = TRUE;
2112                }
2113                continue;
2114
2115            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2116
2117            case CR:
2118                /*falls through*/
2119            case LF:
2120                /* automatically reset to single-byte mode */
2121                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2122                    pToU2022State->cs[0] = (int8_t)ASCII;
2123                }
2124                pToU2022State->cs[2] = 0;
2125                pToU2022State->g = 0;
2126                /* falls through */
2127            default:
2128                /* convert one or two bytes */
2129                myData->isEmptySegment = FALSE;
2130                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2131                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2132                    !IS_JP_DBCS(cs)
2133                ) {
2134                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2135                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2136
2137                    /* return from a single-shift state to the previous one */
2138                    if(pToU2022State->g >= 2) {
2139                        pToU2022State->g=pToU2022State->prevG;
2140                    }
2141                } else switch(cs) {
2142                case ASCII:
2143                    if(mySourceChar <= 0x7f) {
2144                        targetUniChar = mySourceChar;
2145                    }
2146                    break;
2147                case ISO8859_1:
2148                    if(mySourceChar <= 0x7f) {
2149                        targetUniChar = mySourceChar + 0x80;
2150                    }
2151                    /* return from a single-shift state to the previous one */
2152                    pToU2022State->g=pToU2022State->prevG;
2153                    break;
2154                case ISO8859_7:
2155                    if(mySourceChar <= 0x7f) {
2156                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
2157                        targetUniChar =
2158                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2159                                myData->myConverterArray[cs],
2160                                mySourceChar + 0x80);
2161                    }
2162                    /* return from a single-shift state to the previous one */
2163                    pToU2022State->g=pToU2022State->prevG;
2164                    break;
2165                case JISX201:
2166                    if(mySourceChar <= 0x7f) {
2167                        targetUniChar = jisx201ToU(mySourceChar);
2168                    }
2169                    break;
2170                case HWKANA_7BIT:
2171                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2172                        /* 7-bit halfwidth Katakana */
2173                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2174                    }
2175                    break;
2176                default:
2177                    /* G0 DBCS */
2178                    if(mySource < mySourceLimit) {
2179                        int leadIsOk, trailIsOk;
2180                        uint8_t trailByte;
2181getTrailByte:
2182                        trailByte = (uint8_t)*mySource;
2183                        /*
2184                         * Ticket 5691: consistent illegal sequences:
2185                         * - We include at least the first byte in the illegal sequence.
2186                         * - If any of the non-initial bytes could be the start of a character,
2187                         *   we stop the illegal sequence before the first one of those.
2188                         *
2189                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2190                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2191                         * Otherwise we convert or report the pair of bytes.
2192                         */
2193                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2194                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2195                        if (leadIsOk && trailIsOk) {
2196                            ++mySource;
2197                            tmpSourceChar = (mySourceChar << 8) | trailByte;
2198                            if(cs == JISX208) {
2199                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2200                                mySourceChar = tmpSourceChar;
2201                            } else {
2202                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2203                                mySourceChar = tmpSourceChar;
2204                                if (cs == KSC5601) {
2205                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2206                                }
2207                                tempBuf[0] = (char)(tmpSourceChar >> 8);
2208                                tempBuf[1] = (char)(tmpSourceChar);
2209                            }
2210                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2211                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2212                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2213                            ++mySource;
2214                            /* add another bit so that the code below writes 2 bytes in case of error */
2215                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2216                        }
2217                    } else {
2218                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2219                        args->converter->toULength = 1;
2220                        goto endloop;
2221                    }
2222                }  /* End of inner switch */
2223                break;
2224            }  /* End of outer switch */
2225            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2226                if(args->offsets){
2227                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2228                }
2229                *(myTarget++)=(UChar)targetUniChar;
2230            }
2231            else if(targetUniChar > missingCharMarker){
2232                /* disassemble the surrogate pair and write to output*/
2233                targetUniChar-=0x0010000;
2234                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2235                if(args->offsets){
2236                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2237                }
2238                ++myTarget;
2239                if(myTarget< args->targetLimit){
2240                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2241                    if(args->offsets){
2242                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2243                    }
2244                    ++myTarget;
2245                }else{
2246                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2247                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2248                }
2249
2250            }
2251            else{
2252                /* Call the callback function*/
2253                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2254                break;
2255            }
2256        }
2257        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2258            *err =U_BUFFER_OVERFLOW_ERROR;
2259            break;
2260        }
2261    }
2262endloop:
2263    args->target = myTarget;
2264    args->source = mySource;
2265}
2266
2267
2268/***************************************************************
2269*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
2272*      means that it appears by itself on the first line of the file
2273*  ii) There are only 2 shifting sequences SO to shift into double byte mode
2274*      and SI to shift into single byte mode
2275*/
2276static void
2277UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2278
2279    UConverter* saveConv = args->converter;
2280    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2281    args->converter=myConverterData->currentConverter;
2282
2283    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2284    ucnv_MBCSFromUnicodeWithOffsets(args,err);
2285    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2286
2287    if(*err == U_BUFFER_OVERFLOW_ERROR) {
2288        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2289            uprv_memcpy(
2290                saveConv->charErrorBuffer,
2291                myConverterData->currentConverter->charErrorBuffer,
2292                myConverterData->currentConverter->charErrorBufferLength);
2293        }
2294        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2295        myConverterData->currentConverter->charErrorBufferLength = 0;
2296    }
2297    args->converter=saveConv;
2298}
2299
/*
 * Converts UTF-16 input to ISO-2022-KR bytes and, when args->offsets is set,
 * records for every output byte the index of the source unit it came from.
 *
 * - If the converter data says version 1, the whole call is delegated to
 *   UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 * - Otherwise each code unit is looked up with MBCS_FROM_UCHAR32_ISO2022().
 *   Only a single byte <= 0x7f or a byte pair with both bytes in 0xa1..0xfe
 *   is accepted; the pair is written with 0x80 subtracted from each byte.
 * - SO/SI is emitted whenever the output switches between double-byte and
 *   single-byte mode; the current mode is kept in converter->fromUnicodeStatus.
 * - Bytes that do not fit into the target go to converter->charErrorBuffer and
 *   *err becomes U_BUFFER_OVERFLOW_ERROR.
 * - Unconvertible input stops the loop with the offending code point stored in
 *   converter->fromUChar32 and *err set to U_ILLEGAL_CHAR_FOUND or
 *   U_INVALID_CHAR_FOUND.
 * - At the flushed end of input a final SI is written if still in DBCS mode.
 *
 * @param args  conversion arguments; source and target are advanced on return
 * @param err   in/out ICU error code
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* If the version is 1 then the user is requesting
     * conversion with ibm-25546: pass the arguments to the
     * MBCS converter and return.
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    /* the saved status tells whether the previous call ended in double-byte mode */
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): this repeats the assignment made two statements above */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* a code point left over from the previous call: resume at the trail-surrogate check */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only DBCS or SBCS characters are expected */
            /* DBCS results must have both bytes in 0xa1..0xfe, SBCS results must be <= 0x7f */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                  /* append the shift sequence when the byte mode changes */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    /* the loop condition above guarantees room for this one byte */
                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write the converted bytes to the target */
                if(targetByteUnit <= 0x00FF){
                    /* single-byte result */
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        /* no room left: park the byte in the overflow buffer */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double-byte result: each byte is written with 0x80 subtracted */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            /* only the second byte overflows */
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        /* both bytes overflow */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* The code point is unassigned or not acceptable:
                 * set the error and reason.
                 */

                /* check if the char is a first (lead) surrogate */
                if(UTF_IS_SURROGATE(sourceChar)) {
                    if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
                        /* look ahead to find the trail surrogate */
                        if(source <  sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(UTF_IS_SECOND_SURROGATE(trail)) {
                                source++;
                                sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                                /* the combined supplementary code point is reported as unassigned */
                                *err = U_INVALID_CHAR_FOUND;
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input: keep the lead surrogate pending without an error */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                /* remember the offending (or pending) code point and stop */
                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more so that the index points at the lead surrogate of a pair */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        /* write the closing SI; the helper receives the target limit and err for overflow handling */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /* save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}
2516
2517/************************ To Unicode ***************************************/
2518
2519static void
2520UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2521                                                            UErrorCode* err){
2522    char const* sourceStart;
2523    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2524
2525    UConverterToUnicodeArgs subArgs;
2526    int32_t minArgsSize;
2527
2528    /* set up the subconverter arguments */
2529    if(args->size<sizeof(UConverterToUnicodeArgs)) {
2530        minArgsSize = args->size;
2531    } else {
2532        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2533    }
2534
2535    uprv_memcpy(&subArgs, args, minArgsSize);
2536    subArgs.size = (uint16_t)minArgsSize;
2537    subArgs.converter = myData->currentConverter;
2538
2539    /* remember the original start of the input for offsets */
2540    sourceStart = args->source;
2541
2542    if(myData->key != 0) {
2543        /* continue with a partial escape sequence */
2544        goto escape;
2545    }
2546
2547    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2548        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2549        subArgs.source = args->source;
2550        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2551        if(subArgs.source != subArgs.sourceLimit) {
2552            /*
2553             * get the current partial byte sequence
2554             *
2555             * it needs to be moved between the public and the subconverter
2556             * so that the conversion framework, which only sees the public
2557             * converter, can handle truncated and illegal input etc.
2558             */
2559            if(args->converter->toULength > 0) {
2560                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2561            }
2562            subArgs.converter->toULength = args->converter->toULength;
2563
2564            /*
2565             * Convert up to the end of the input, or to before the next escape character.
2566             * Does not handle conversion extensions because the preToU[] state etc.
2567             * is not copied.
2568             */
2569            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2570
2571            if(args->offsets != NULL && sourceStart != args->source) {
2572                /* update offsets to base them on the actual start of the input */
2573                int32_t *offsets = args->offsets;
2574                UChar *target = args->target;
2575                int32_t delta = (int32_t)(args->source - sourceStart);
2576                while(target < subArgs.target) {
2577                    if(*offsets >= 0) {
2578                        *offsets += delta;
2579                    }
2580                    ++offsets;
2581                    ++target;
2582                }
2583            }
2584            args->source = subArgs.source;
2585            args->target = subArgs.target;
2586            args->offsets = subArgs.offsets;
2587
2588            /* copy input/error/overflow buffers */
2589            if(subArgs.converter->toULength > 0) {
2590                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2591            }
2592            args->converter->toULength = subArgs.converter->toULength;
2593
2594            if(*err == U_BUFFER_OVERFLOW_ERROR) {
2595                if(subArgs.converter->UCharErrorBufferLength > 0) {
2596                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2597                                subArgs.converter->UCharErrorBufferLength);
2598                }
2599                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2600                subArgs.converter->UCharErrorBufferLength = 0;
2601            }
2602        }
2603
2604        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2605            return;
2606        }
2607
2608escape:
2609        changeState_2022(args->converter,
2610               &(args->source),
2611               args->sourceLimit,
2612               ISO_2022_KR,
2613               err);
2614    }
2615}
2616
2617static void
2618UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2619                                                            UErrorCode* err){
2620    char tempBuf[2];
2621    const char *mySource = ( char *) args->source;
2622    UChar *myTarget = args->target;
2623    const char *mySourceLimit = args->sourceLimit;
2624    UChar32 targetUniChar = 0x0000;
2625    UChar mySourceChar = 0x0000;
2626    UConverterDataISO2022* myData;
2627    UConverterSharedData* sharedData ;
2628    UBool useFallback;
2629
2630    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2631    if(myData->version==1){
2632        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2633        return;
2634    }
2635
2636    /* initialize state */
2637    sharedData = myData->currentConverter->sharedData;
2638    useFallback = args->converter->useFallback;
2639
2640    if(myData->key != 0) {
2641        /* continue with a partial escape sequence */
2642        goto escape;
2643    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2644        /* continue with a partial double-byte character */
2645        mySourceChar = args->converter->toUBytes[0];
2646        args->converter->toULength = 0;
2647        goto getTrailByte;
2648    }
2649
2650    while(mySource< mySourceLimit){
2651
2652        if(myTarget < args->targetLimit){
2653
2654            mySourceChar= (unsigned char) *mySource++;
2655
2656            if(mySourceChar==UCNV_SI){
2657                myData->toU2022State.g = 0;
2658                if (myData->isEmptySegment) {
2659                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
2660                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2661                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
2662                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2663                    args->converter->toULength = 1;
2664                    args->target = myTarget;
2665                    args->source = mySource;
2666                    return;
2667                }
2668                /*consume the source */
2669                continue;
2670            }else if(mySourceChar==UCNV_SO){
2671                myData->toU2022State.g = 1;
2672                myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
2673                /*consume the source */
2674                continue;
2675            }else if(mySourceChar==ESC_2022){
2676                mySource--;
2677escape:
2678                myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */
2679                changeState_2022(args->converter,&(mySource),
2680                                mySourceLimit, ISO_2022_KR, err);
2681                if(U_FAILURE(*err)){
2682                    args->target = myTarget;
2683                    args->source = mySource;
2684                    return;
2685                }
2686                continue;
2687            }
2688
2689            myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
2690            if(myData->toU2022State.g == 1) {
2691                if(mySource < mySourceLimit) {
2692                    int leadIsOk, trailIsOk;
2693                    uint8_t trailByte;
2694getTrailByte:
2695                    targetUniChar = missingCharMarker;
2696                    trailByte = (uint8_t)*mySource;
2697                    /*
2698                     * Ticket 5691: consistent illegal sequences:
2699                     * - We include at least the first byte in the illegal sequence.
2700                     * - If any of the non-initial bytes could be the start of a character,
2701                     *   we stop the illegal sequence before the first one of those.
2702                     *
2703                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2704                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2705                     * Otherwise we convert or report the pair of bytes.
2706                     */
2707                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2708                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2709                    if (leadIsOk && trailIsOk) {
2710                        ++mySource;
2711                        tempBuf[0] = (char)(mySourceChar + 0x80);
2712                        tempBuf[1] = (char)(trailByte + 0x80);
2713                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2714                        mySourceChar = (mySourceChar << 8) | trailByte;
2715                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2716                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2717                        ++mySource;
2718                        /* add another bit so that the code below writes 2 bytes in case of error */
2719                        mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2720                    }
2721                } else {
2722                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2723                    args->converter->toULength = 1;
2724                    break;
2725                }
2726            }
2727            else if(mySourceChar <= 0x7f) {
2728                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2729            } else {
2730                targetUniChar = 0xffff;
2731            }
2732            if(targetUniChar < 0xfffe){
2733                if(args->offsets) {
2734                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2735                }
2736                *(myTarget++)=(UChar)targetUniChar;
2737            }
2738            else {
2739                /* Call the callback function*/
2740                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2741                break;
2742            }
2743        }
2744        else{
2745            *err =U_BUFFER_OVERFLOW_ERROR;
2746            break;
2747        }
2748    }
2749    args->target = myTarget;
2750    args->source = mySource;
2751}
2752
2753/*************************** END ISO2022-KR *********************************/
2754
2755/*************************** ISO-2022-CN *********************************
2756*
2757* Rules for ISO-2022-CN Encoding:
2758* i)   The designator sequence must appear once on a line before any instance
2759*      of character set it designates.
2760* ii)  If two lines contain characters from the same character set, both lines
2761*      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence has to be found
*      to invoke the designated character set
2764* iv)  All lines start in ASCII and end in ASCII.
2765* v)   Four shifting sequences are employed for this purpose:
2766*
*      Sequence    ASCII Eq    Charsets
2768*      ----------  -------    ---------
2769*      SI           <SI>        US-ASCII
2770*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2771*      SS2          <ESC>N      CNS-11643-1992 Plane 2
2772*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2773*
2774* vi)
2775*      SOdesignator  : ESC "$" ")" finalchar_for_SO
2776*      SS2designator : ESC "$" "*" finalchar_for_SS2
2777*      SS3designator : ESC "$" "+" finalchar_for_SS3
2778*
2779*      ESC $ ) A       Indicates the bytes following SO are Chinese
2780*       characters as defined in GB 2312-80, until
2781*       another SOdesignation appears
2782*
2783*
2784*      ESC $ ) E       Indicates the bytes following SO are as defined
2785*       in ISO-IR-165 (for details, see section 2.1),
2786*       until another SOdesignation appears
2787*
2788*      ESC $ ) G       Indicates the bytes following SO are as defined
2789*       in CNS 11643-plane-1, until another
2790*       SOdesignation appears
2791*
2792*      ESC $ * H       Indicates the two bytes immediately following
2793*       SS2 is a Chinese character as defined in CNS
2794*       11643-plane-2, until another SS2designation
2795*       appears
*       (Meaning <ESC>N must precede every 2 byte
2797*        sequence.)
2798*
2799*      ESC $ + I       Indicates the immediate two bytes following SS3
2800*       is a Chinese character as defined in CNS
2801*       11643-plane-3, until another SS3designation
2802*       appears
*       (Meaning <ESC>O must precede every 2 byte
2804*        sequence.)
2805*
2806*      ESC $ + J       Indicates the immediate two bytes following SS3
2807*       is a Chinese character as defined in CNS
2808*       11643-plane-4, until another SS3designation
2809*       appears
*       (In English: <ESC>O must precede every 2 byte
2811*        sequence.)
2812*
2813*      ESC $ + K       Indicates the immediate two bytes following SS3
2814*       is a Chinese character as defined in CNS
2815*       11643-plane-5, until another SS3designation
2816*       appears
2817*
2818*      ESC $ + L       Indicates the immediate two bytes following SS3
2819*       is a Chinese character as defined in CNS
2820*       11643-plane-6, until another SS3designation
2821*       appears
2822*
2823*      ESC $ + M       Indicates the immediate two bytes following SS3
2824*       is a Chinese character as defined in CNS
2825*       11643-plane-7, until another SS3designation
2826*       appears
2827*
2828*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2829*       has its own designation information before any Chinese characters
2830*       appear
2831*
2832*/
2833
/*
 * Designator escape sequences for ISO-2022-CN.
 * They are defined as const char arrays to make the strings truly read-only.
 * Each sequence is spelled with hex escapes; the ASCII reading is given
 * next to it.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";               /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";               /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";   /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";   /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";   /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";   /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";   /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";   /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";   /* ESC $ + M */

/********************** ISO2022-CN Data **************************/
/*
 * Table of the sequences above, with the shift-in string for ASCII first.
 * NOTE(review): the index presumably corresponds to the character set
 * constants used by the ISO-2022-CN converter - confirm against the users
 * of this table.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,           /* ASCII */
        GB_2312_80_STR,
        ISO_IR_165_STR,
        CNS_11643_1992_Plane_1_STR,
        CNS_11643_1992_Plane_2_STR,
        CNS_11643_1992_Plane_3_STR,
        CNS_11643_1992_Plane_4_STR,
        CNS_11643_1992_Plane_5_STR,
        CNS_11643_1992_Plane_6_STR,
        CNS_11643_1992_Plane_7_STR
};
2858
2859static void
2860UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2861    UConverter *cnv = args->converter;
2862    UConverterDataISO2022 *converterData;
2863    ISO2022State *pFromU2022State;
2864    uint8_t *target = (uint8_t *) args->target;
2865    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2866    const UChar* source = args->source;
2867    const UChar* sourceLimit = args->sourceLimit;
2868    int32_t* offsets = args->offsets;
2869    UChar32 sourceChar;
2870    char buffer[8];
2871    int32_t len;
2872    int8_t choices[3];
2873    int32_t choiceCount;
2874    uint32_t targetValue = 0;
2875    UBool useFallback;
2876
2877    /* set up the state */
2878    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2879    pFromU2022State   = &converterData->fromU2022State;
2880
2881    choiceCount = 0;
2882
2883    /* check if the last codepoint of previous buffer was a lead surrogate*/
2884    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2885        goto getTrail;
2886    }
2887
2888    while( source < sourceLimit){
2889        if(target < targetLimit){
2890
2891            sourceChar  = *(source++);
2892            /*check if the char is a First surrogate*/
2893             if(UTF_IS_SURROGATE(sourceChar)) {
2894                if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2895getTrail:
2896                    /*look ahead to find the trail surrogate*/
2897                    if(source < sourceLimit) {
2898                        /* test the following code unit */
2899                        UChar trail=(UChar) *source;
2900                        if(UTF_IS_SECOND_SURROGATE(trail)) {
2901                            source++;
2902                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2903                            cnv->fromUChar32=0x00;
2904                            /* convert this supplementary code point */
2905                            /* exit this condition tree */
2906                        } else {
2907                            /* this is an unmatched lead code unit (1st surrogate) */
2908                            /* callback(illegal) */
2909                            *err=U_ILLEGAL_CHAR_FOUND;
2910                            cnv->fromUChar32=sourceChar;
2911                            break;
2912                        }
2913                    } else {
2914                        /* no more input */
2915                        cnv->fromUChar32=sourceChar;
2916                        break;
2917                    }
2918                } else {
2919                    /* this is an unmatched trail code unit (2nd surrogate) */
2920                    /* callback(illegal) */
2921                    *err=U_ILLEGAL_CHAR_FOUND;
2922                    cnv->fromUChar32=sourceChar;
2923                    break;
2924                }
2925            }
2926
2927            /* do the conversion */
2928            if(sourceChar <= 0x007f ){
2929                /* do not convert SO/SI/ESC */
2930                if(IS_2022_CONTROL(sourceChar)) {
2931                    /* callback(illegal) */
2932                    *err=U_ILLEGAL_CHAR_FOUND;
2933                    cnv->fromUChar32=sourceChar;
2934                    break;
2935                }
2936
2937                /* US-ASCII */
2938                if(pFromU2022State->g == 0) {
2939                    buffer[0] = (char)sourceChar;
2940                    len = 1;
2941                } else {
2942                    buffer[0] = UCNV_SI;
2943                    buffer[1] = (char)sourceChar;
2944                    len = 2;
2945                    pFromU2022State->g = 0;
2946                    choiceCount = 0;
2947                }
2948                if(sourceChar == CR || sourceChar == LF) {
2949                    /* reset the state at the end of a line */
2950                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2951                    choiceCount = 0;
2952                }
2953            }
2954            else{
2955                /* convert U+0080..U+10ffff */
2956                int32_t i;
2957                int8_t cs, g;
2958
2959                if(choiceCount == 0) {
2960                    /* try the current SO/G1 converter first */
2961                    choices[0] = pFromU2022State->cs[1];
2962
2963                    /* default to GB2312_1 if none is designated yet */
2964                    if(choices[0] == 0) {
2965                        choices[0] = GB2312_1;
2966                    }
2967
2968                    if(converterData->version == 0) {
2969                        /* ISO-2022-CN */
2970
2971                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2972                        if(choices[0] == GB2312_1) {
2973                            choices[1] = (int8_t)CNS_11643_1;
2974                        } else {
2975                            choices[1] = (int8_t)GB2312_1;
2976                        }
2977
2978                        choiceCount = 2;
2979                    } else if (converterData->version == 1) {
2980                        /* ISO-2022-CN-EXT */
2981
2982                        /* try one of the other converters */
2983                        switch(choices[0]) {
2984                        case GB2312_1:
2985                            choices[1] = (int8_t)CNS_11643_1;
2986                            choices[2] = (int8_t)ISO_IR_165;
2987                            break;
2988                        case ISO_IR_165:
2989                            choices[1] = (int8_t)GB2312_1;
2990                            choices[2] = (int8_t)CNS_11643_1;
2991                            break;
2992                        default: /* CNS_11643_x */
2993                            choices[1] = (int8_t)GB2312_1;
2994                            choices[2] = (int8_t)ISO_IR_165;
2995                            break;
2996                        }
2997
2998                        choiceCount = 3;
2999                    } else {
3000                        choices[0] = (int8_t)CNS_11643_1;
3001                        choices[1] = (int8_t)GB2312_1;
3002                    }
3003                }
3004
3005                cs = g = 0;
3006                /*
3007                 * len==0: no mapping found yet
3008                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3009                 * len>0: found a roundtrip result, done
3010                 */
3011                len = 0;
3012                /*
3013                 * We will turn off useFallback after finding a fallback,
3014                 * but we still get fallbacks from PUA code points as usual.
3015                 * Therefore, we will also need to check that we don't overwrite
3016                 * an early fallback with a later one.
3017                 */
3018                useFallback = cnv->useFallback;
3019
3020                for(i = 0; i < choiceCount && len <= 0; ++i) {
3021                    int8_t cs0 = choices[i];
3022                    if(cs0 > 0) {
3023                        uint32_t value;
3024                        int32_t len2;
3025                        if(cs0 >= CNS_11643_0) {
3026                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3027                                        converterData->myConverterArray[CNS_11643],
3028                                        sourceChar,
3029                                        &value,
3030                                        useFallback,
3031                                        MBCS_OUTPUT_3);
3032                            if(len2 == 3 || (len2 == -3 && len == 0)) {
3033                                targetValue = value;
3034                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3035                                if(len2 >= 0) {
3036                                    len = 2;
3037                                } else {
3038                                    len = -2;
3039                                    useFallback = FALSE;
3040                                }
3041                                if(cs == CNS_11643_1) {
3042                                    g = 1;
3043                                } else if(cs == CNS_11643_2) {
3044                                    g = 2;
3045                                } else /* plane 3..7 */ if(converterData->version == 1) {
3046                                    g = 3;
3047                                } else {
3048                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3049                                    len = 0;
3050                                }
3051                            }
3052                        } else {
3053                            /* GB2312_1 or ISO-IR-165 */
3054                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3055                                        converterData->myConverterArray[cs0],
3056                                        sourceChar,
3057                                        &value,
3058                                        useFallback,
3059                                        MBCS_OUTPUT_2);
3060                            if(len2 == 2 || (len2 == -2 && len == 0)) {
3061                                targetValue = value;
3062                                len = len2;
3063                                cs = cs0;
3064                                g = 1;
3065                                useFallback = FALSE;
3066                            }
3067                        }
3068                    }
3069                }
3070
3071                if(len != 0) {
3072                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
3073
3074                    /* write the designation sequence if necessary */
3075                    if(cs != pFromU2022State->cs[g]) {
3076                        if(cs < CNS_11643) {
3077                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3078                        } else {
3079                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3080                        }
3081                        len = 4;
3082                        pFromU2022State->cs[g] = cs;
3083                        if(g == 1) {
3084                            /* changing the SO/G1 charset invalidates the choices[] */
3085                            choiceCount = 0;
3086                        }
3087                    }
3088
3089                    /* write the shift sequence if necessary */
3090                    if(g != pFromU2022State->g) {
3091                        switch(g) {
3092                        case 1:
3093                            buffer[len++] = UCNV_SO;
3094
3095                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3096                            pFromU2022State->g = 1;
3097                            break;
3098                        case 2:
3099                            buffer[len++] = 0x1b;
3100                            buffer[len++] = 0x4e;
3101                            break;
3102                        default: /* case 3 */
3103                            buffer[len++] = 0x1b;
3104                            buffer[len++] = 0x4f;
3105                            break;
3106                        }
3107                    }
3108
3109                    /* write the two output bytes */
3110                    buffer[len++] = (char)(targetValue >> 8);
3111                    buffer[len++] = (char)targetValue;
3112                } else {
3113                    /* if we cannot find the character after checking all codepages
3114                     * then this is an error
3115                     */
3116                    *err = U_INVALID_CHAR_FOUND;
3117                    cnv->fromUChar32=sourceChar;
3118                    break;
3119                }
3120            }
3121
3122            /* output len>0 bytes in buffer[] */
3123            if(len == 1) {
3124                *target++ = buffer[0];
3125                if(offsets) {
3126                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3127                }
3128            } else if(len == 2 && (target + 2) <= targetLimit) {
3129                *target++ = buffer[0];
3130                *target++ = buffer[1];
3131                if(offsets) {
3132                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3133                    *offsets++ = sourceIndex;
3134                    *offsets++ = sourceIndex;
3135                }
3136            } else {
3137                fromUWriteUInt8(
3138                    cnv,
3139                    buffer, len,
3140                    &target, (const char *)targetLimit,
3141                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3142                    err);
3143                if(U_FAILURE(*err)) {
3144                    break;
3145                }
3146            }
3147        } /* end if(myTargetIndex<myTargetLength) */
3148        else{
3149            *err =U_BUFFER_OVERFLOW_ERROR;
3150            break;
3151        }
3152
3153    }/* end while(mySourceIndex<mySourceLength) */
3154
3155    /*
3156     * the end of the input stream and detection of truncated input
3157     * are handled by the framework, but for ISO-2022-CN conversion
3158     * we need to be in ASCII mode at the very end
3159     *
3160     * conditions:
3161     *   successful
3162     *   not in ASCII mode
3163     *   end of input and no truncated input
3164     */
3165    if( U_SUCCESS(*err) &&
3166        pFromU2022State->g!=0 &&
3167        args->flush && source>=sourceLimit && cnv->fromUChar32==0
3168    ) {
3169        int32_t sourceIndex;
3170
3171        /* we are switching to ASCII */
3172        pFromU2022State->g=0;
3173
3174        /* get the source index of the last input character */
3175        /*
3176         * TODO this would be simpler and more reliable if we used a pair
3177         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3178         * so that we could simply use the prevSourceIndex here;
3179         * this code gives an incorrect result for the rare case of an unmatched
3180         * trail surrogate that is alone in the last buffer of the text stream
3181         */
3182        sourceIndex=(int32_t)(source-args->source);
3183        if(sourceIndex>0) {
3184            --sourceIndex;
3185            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3186                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3187            ) {
3188                --sourceIndex;
3189            }
3190        } else {
3191            sourceIndex=-1;
3192        }
3193
3194        fromUWriteUInt8(
3195            cnv,
3196            SHIFT_IN_STR, 1,
3197            &target, (const char *)targetLimit,
3198            &offsets, sourceIndex,
3199            err);
3200    }
3201
3202    /*save the state and return */
3203    args->source = source;
3204    args->target = (char*)target;
3205}
3206
3207
/*
 * Convert ISO-2022-CN bytes from [args->source, args->sourceLimit) to UTF-16
 * in [args->target, args->targetLimit), optionally recording source offsets
 * in args->offsets (indexed by the position of the output UChar).
 *
 * The shift/designation state lives in myData->toU2022State:
 *   g     - graphic set used for the next character (0 = single-byte ASCII)
 *   cs[g] - charset designated to that set
 *   prevG - set to return to after a single-shifted character (g>=2)
 * SI selects g=0, SO selects g=1 (only if a G1 charset has been designated),
 * ESC sequences are parsed by changeState_2022(), and CR/LF clear the state.
 * myData->isEmptySegment flags an SO that has not yet been followed by a
 * character, so that an immediately following SI or escape sequence is
 * reported as U_ILLEGAL_ESCAPE_SEQUENCE.
 *
 * A call may resume in the middle of an escape sequence (myData->key != 0)
 * or after the first byte of a double-byte character
 * (args->converter->toULength == 1, byte saved in toUBytes[0]).
 *
 * On return args->source and args->target point past the consumed input and
 * the written output; *err reports U_BUFFER_OVERFLOW_ERROR when the target is
 * full, an escape-sequence error, or whatever toUnicodeCallback() sets for
 * unmappable/illegal bytes.
 */
static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* assume "unmappable" until a conversion below succeeds */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                    /* leaves the switch with targetUniChar==missingCharMarker, which reaches the callback branch below */
                    break;
                }

            case ESC_2022:
                /* step back so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                {
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        /* report the bytes consumed by this call in addition to those pending before it */
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* end of line: forget all designations and shifts */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS 11643: prefix the pair with a plane byte (0x80 + plane offset) for a 3-byte lookup */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* the trail byte is not available yet: save the lead byte for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* single-byte mode: only 7-bit values are passed through unchanged */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result; the offset points at the first of the one or two source bytes consumed */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: keep it in the converter's UChar error buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* write the advanced pointers back to the caller's argument struct */
    args->target = myTarget;
    args->source = mySource;
}
3408
3409static void
3410_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3411    UConverter *cnv = args->converter;
3412    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3413    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3414    char *p, *subchar;
3415    char buffer[8];
3416    int32_t length;
3417
3418    subchar=(char *)cnv->subChars;
3419    length=cnv->subCharLen; /* assume length==1 for most variants */
3420
3421    p = buffer;
3422    switch(myConverterData->locale[0]){
3423    case 'j':
3424        {
3425            int8_t cs;
3426
3427            if(pFromU2022State->g == 1) {
3428                /* JIS7: switch from G1 to G0 */
3429                pFromU2022State->g = 0;
3430                *p++ = UCNV_SI;
3431            }
3432
3433            cs = pFromU2022State->cs[0];
3434            if(cs != ASCII && cs != JISX201) {
3435                /* not in ASCII or JIS X 0201: switch to ASCII */
3436                pFromU2022State->cs[0] = (int8_t)ASCII;
3437                *p++ = '\x1b';
3438                *p++ = '\x28';
3439                *p++ = '\x42';
3440            }
3441
3442            *p++ = subchar[0];
3443            break;
3444        }
3445    case 'c':
3446        if(pFromU2022State->g != 0) {
3447            /* not in ASCII mode: switch to ASCII */
3448            pFromU2022State->g = 0;
3449            *p++ = UCNV_SI;
3450        }
3451        *p++ = subchar[0];
3452        break;
3453    case 'k':
3454        if(myConverterData->version == 0) {
3455            if(length == 1) {
3456                if((UBool)args->converter->fromUnicodeStatus) {
3457                    /* in DBCS mode: switch to SBCS */
3458                    args->converter->fromUnicodeStatus = 0;
3459                    *p++ = UCNV_SI;
3460                }
3461                *p++ = subchar[0];
3462            } else /* length == 2*/ {
3463                if(!(UBool)args->converter->fromUnicodeStatus) {
3464                    /* in SBCS mode: switch to DBCS */
3465                    args->converter->fromUnicodeStatus = 1;
3466                    *p++ = UCNV_SO;
3467                }
3468                *p++ = subchar[0];
3469                *p++ = subchar[1];
3470            }
3471            break;
3472        } else {
3473            /* save the subconverter's substitution string */
3474            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3475            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3476
3477            /* set our substitution string into the subconverter */
3478            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3479            myConverterData->currentConverter->subCharLen = (int8_t)length;
3480
3481            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3482            args->converter = myConverterData->currentConverter;
3483            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3484            ucnv_cbFromUWriteSub(args, 0, err);
3485            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3486            args->converter = cnv;
3487
3488            /* restore the subconverter's substitution string */
3489            myConverterData->currentConverter->subChars = currentSubChars;
3490            myConverterData->currentConverter->subCharLen = currentSubCharLen;
3491
3492            if(*err == U_BUFFER_OVERFLOW_ERROR) {
3493                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3494                    uprv_memcpy(
3495                        cnv->charErrorBuffer,
3496                        myConverterData->currentConverter->charErrorBuffer,
3497                        myConverterData->currentConverter->charErrorBufferLength);
3498                }
3499                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3500                myConverterData->currentConverter->charErrorBufferLength = 0;
3501            }
3502            return;
3503        }
3504    default:
3505        /* not expected */
3506        break;
3507    }
3508    ucnv_cbFromUWriteBytes(args,
3509                           buffer, (int32_t)(p - buffer),
3510                           offsetIndex, err);
3511}
3512
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 *
 * The field order matters: _ISO_2022_SafeClone() casts the caller's buffer to
 * this struct, returns the address of cnv, and gives the sub-converter clone
 * sizeof(UConverter)+sizeof(UAlignedMemory) bytes starting at currentConverter.
 */
struct cloneStruct
{
    /* the cloned ISO 2022 converter itself; its address is what the clone function returns */
    UConverter cnv;
    /* storage for the clone of the sub-converter (cnvData->currentConverter), if there is one */
    UConverter currentConverter;
    /* padding that absorbs any extra alignment applied when cloning into currentConverter */
    UAlignedMemory deadSpace;
    /* copy of the converter's extra data; cnv.extraInfo is pointed here */
    UConverterDataISO2022 mydata;
};
3531
3532
3533static UConverter *
3534_ISO_2022_SafeClone(
3535            const UConverter *cnv,
3536            void *stackBuffer,
3537            int32_t *pBufferSize,
3538            UErrorCode *status)
3539{
3540    struct cloneStruct * localClone;
3541    UConverterDataISO2022 *cnvData;
3542    int32_t i, size;
3543
3544    if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3545        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3546        return NULL;
3547    }
3548
3549    cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3550    localClone = (struct cloneStruct *)stackBuffer;
3551
3552    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3553
3554    uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3555    localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3556    localClone->cnv.isExtraLocal = TRUE;
3557
3558    /* share the subconverters */
3559
3560    if(cnvData->currentConverter != NULL) {
3561        size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3562        localClone->mydata.currentConverter =
3563            ucnv_safeClone(cnvData->currentConverter,
3564                            &localClone->currentConverter,
3565                            &size, status);
3566        if(U_FAILURE(*status)) {
3567            return NULL;
3568        }
3569    }
3570
3571    for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3572        if(cnvData->myConverterArray[i] != NULL) {
3573            ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3574        }
3575    }
3576
3577    return &localClone->cnv;
3578}
3579
3580static void
3581_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3582                    const USetAdder *sa,
3583                    UConverterUnicodeSet which,
3584                    UErrorCode *pErrorCode)
3585{
3586    int32_t i;
3587    UConverterDataISO2022* cnvData;
3588
3589    if (U_FAILURE(*pErrorCode)) {
3590        return;
3591    }
3592#ifdef U_ENABLE_GENERIC_ISO_2022
3593    if (cnv->sharedData == &_ISO2022Data) {
3594        /* We use UTF-8 in this case */
3595        sa->addRange(sa->set, 0, 0xd7FF);
3596        sa->addRange(sa->set, 0xE000, 0x10FFFF);
3597        return;
3598    }
3599#endif
3600
3601    cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3602
3603    /* open a set and initialize it with code points that are algorithmically round-tripped */
3604    switch(cnvData->locale[0]){
3605    case 'j':
3606        /* include JIS X 0201 which is hardcoded */
3607        sa->add(sa->set, 0xa5);
3608        sa->add(sa->set, 0x203e);
3609        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3610            /* include Latin-1 for some variants of JP */
3611            sa->addRange(sa->set, 0, 0xff);
3612        } else {
3613            /* include ASCII for JP */
3614            sa->addRange(sa->set, 0, 0x7f);
3615        }
3616        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3617            /*
3618             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3619             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3620             * use half-width Katakana.
3621             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3622             * half-width Katakana via the ESC ( I sequence.
3623             * However, we only emit (fromUnicode) half-width Katakana according to the
3624             * definition of each variant.
3625             *
3626             * When including fallbacks,
3627             * we need to include half-width Katakana Unicode code points for all JP variants because
3628             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3629             */
3630            /* include half-width Katakana for JP */
3631            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3632        }
3633        break;
3634    case 'c':
3635    case 'z':
3636        /* include ASCII for CN */
3637        sa->addRange(sa->set, 0, 0x7f);
3638        break;
3639    case 'k':
3640        /* there is only one converter for KR, and it is not in the myConverterArray[] */
3641        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3642                cnvData->currentConverter, sa, which, pErrorCode);
3643        /* the loop over myConverterArray[] will simply not find another converter */
3644        break;
3645    default:
3646        break;
3647    }
3648
3649#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3650            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3651                cnvData->version==0 && i==CNS_11643
3652            ) {
3653                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3654                ucnv_MBCSGetUnicodeSetForBytes(
3655                        cnvData->myConverterArray[i],
3656                        sa, UCNV_ROUNDTRIP_SET,
3657                        0, 0x81, 0x82,
3658                        pErrorCode);
3659            }
3660#endif
3661
3662    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3663        UConverterSetFilter filter;
3664        if(cnvData->myConverterArray[i]!=NULL) {
3665            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3666                cnvData->version==0 && i==CNS_11643
3667            ) {
3668                /*
3669                 * Version-specific for CN:
3670                 * CN version 0 does not map CNS planes 3..7 although
3671                 * they are all available in the CNS conversion table;
3672                 * CN version 1 (-EXT) does map them all.
3673                 * The two versions create different Unicode sets.
3674                 */
3675                filter=UCNV_SET_FILTER_2022_CN;
3676            } else if(cnvData->locale[0]=='j' && i==JISX208) {
3677                /*
3678                 * Only add code points that map to Shift-JIS codes
3679                 * corresponding to JIS X 0208.
3680                 */
3681                filter=UCNV_SET_FILTER_SJIS;
3682            } else if(i==KSC5601) {
3683                /*
3684                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3685                 * are broader than GR94.
3686                 */
3687                filter=UCNV_SET_FILTER_GR94DBCS;
3688            } else {
3689                filter=UCNV_SET_FILTER_NONE;
3690            }
3691            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3692        }
3693    }
3694
3695    /*
3696     * ISO 2022 converters must not convert SO/SI/ESC despite what
3697     * sub-converters do by themselves.
3698     * Remove these characters from the set.
3699     */
3700    sa->remove(sa->set, 0x0e);
3701    sa->remove(sa->set, 0x0f);
3702    sa->remove(sa->set, 0x1b);
3703
3704    /* ISO 2022 converters do not convert C1 controls either */
3705    sa->removeRange(sa->set, 0x80, 0x9f);
3706}
3707
3708static const UConverterImpl _ISO2022Impl={
3709    UCNV_ISO_2022,
3710
3711    NULL,
3712    NULL,
3713
3714    _ISO2022Open,
3715    _ISO2022Close,
3716    _ISO2022Reset,
3717
3718#ifdef U_ENABLE_GENERIC_ISO_2022
3719    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3720    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3721    ucnv_fromUnicode_UTF8,
3722    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3723#else
3724    NULL,
3725    NULL,
3726    NULL,
3727    NULL,
3728#endif
3729    NULL,
3730
3731    NULL,
3732    _ISO2022getName,
3733    _ISO_2022_WriteSub,
3734    _ISO_2022_SafeClone,
3735    _ISO_2022_GetUnicodeSet
3736};
3737static const UConverterStaticData _ISO2022StaticData={
3738    sizeof(UConverterStaticData),
3739    "ISO_2022",
3740    2022,
3741    UCNV_IBM,
3742    UCNV_ISO_2022,
3743    1,
3744    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3745    { 0x1a, 0, 0, 0 },
3746    1,
3747    FALSE,
3748    FALSE,
3749    0,
3750    0,
3751    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3752};
3753const UConverterSharedData _ISO2022Data={
3754    sizeof(UConverterSharedData),
3755    ~((uint32_t) 0),
3756    NULL,
3757    NULL,
3758    &_ISO2022StaticData,
3759    FALSE,
3760    &_ISO2022Impl,
3761    0
3762};
3763
3764/*************JP****************/
3765static const UConverterImpl _ISO2022JPImpl={
3766    UCNV_ISO_2022,
3767
3768    NULL,
3769    NULL,
3770
3771    _ISO2022Open,
3772    _ISO2022Close,
3773    _ISO2022Reset,
3774
3775    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3776    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3777    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3778    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3779    NULL,
3780
3781    NULL,
3782    _ISO2022getName,
3783    _ISO_2022_WriteSub,
3784    _ISO_2022_SafeClone,
3785    _ISO_2022_GetUnicodeSet
3786};
3787static const UConverterStaticData _ISO2022JPStaticData={
3788    sizeof(UConverterStaticData),
3789    "ISO_2022_JP",
3790    0,
3791    UCNV_IBM,
3792    UCNV_ISO_2022,
3793    1,
3794    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3795    { 0x1a, 0, 0, 0 },
3796    1,
3797    FALSE,
3798    FALSE,
3799    0,
3800    0,
3801    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3802};
3803static const UConverterSharedData _ISO2022JPData={
3804    sizeof(UConverterSharedData),
3805    ~((uint32_t) 0),
3806    NULL,
3807    NULL,
3808    &_ISO2022JPStaticData,
3809    FALSE,
3810    &_ISO2022JPImpl,
3811    0
3812};
3813
3814/************* KR ***************/
3815static const UConverterImpl _ISO2022KRImpl={
3816    UCNV_ISO_2022,
3817
3818    NULL,
3819    NULL,
3820
3821    _ISO2022Open,
3822    _ISO2022Close,
3823    _ISO2022Reset,
3824
3825    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3826    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3827    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3828    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3829    NULL,
3830
3831    NULL,
3832    _ISO2022getName,
3833    _ISO_2022_WriteSub,
3834    _ISO_2022_SafeClone,
3835    _ISO_2022_GetUnicodeSet
3836};
3837static const UConverterStaticData _ISO2022KRStaticData={
3838    sizeof(UConverterStaticData),
3839    "ISO_2022_KR",
3840    0,
3841    UCNV_IBM,
3842    UCNV_ISO_2022,
3843    1,
3844    3, /* max 3 bytes per UChar: SO+DBCS */
3845    { 0x1a, 0, 0, 0 },
3846    1,
3847    FALSE,
3848    FALSE,
3849    0,
3850    0,
3851    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3852};
3853static const UConverterSharedData _ISO2022KRData={
3854    sizeof(UConverterSharedData),
3855    ~((uint32_t) 0),
3856    NULL,
3857    NULL,
3858    &_ISO2022KRStaticData,
3859    FALSE,
3860    &_ISO2022KRImpl,
3861    0
3862};
3863
3864/*************** CN ***************/
3865static const UConverterImpl _ISO2022CNImpl={
3866
3867    UCNV_ISO_2022,
3868
3869    NULL,
3870    NULL,
3871
3872    _ISO2022Open,
3873    _ISO2022Close,
3874    _ISO2022Reset,
3875
3876    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3877    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3878    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3879    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3880    NULL,
3881
3882    NULL,
3883    _ISO2022getName,
3884    _ISO_2022_WriteSub,
3885    _ISO_2022_SafeClone,
3886    _ISO_2022_GetUnicodeSet
3887};
3888static const UConverterStaticData _ISO2022CNStaticData={
3889    sizeof(UConverterStaticData),
3890    "ISO_2022_CN",
3891    0,
3892    UCNV_IBM,
3893    UCNV_ISO_2022,
3894    1,
3895    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3896    { 0x1a, 0, 0, 0 },
3897    1,
3898    FALSE,
3899    FALSE,
3900    0,
3901    0,
3902    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3903};
3904static const UConverterSharedData _ISO2022CNData={
3905    sizeof(UConverterSharedData),
3906    ~((uint32_t) 0),
3907    NULL,
3908    NULL,
3909    &_ISO2022CNStaticData,
3910    FALSE,
3911    &_ISO2022CNImpl,
3912    0
3913};
3914
3915
3916
3917#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3918