1/*
2**********************************************************************
3*   Copyright (C) 2000-2010, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv2022.c
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2000feb03
12*   created by: Markus W. Scherer
13*
14*   Change history:
15*
16*   06/29/2000  helena  Major rewrite of the callback APIs.
17*   08/08/2000  Ram     Included support for ISO-2022-JP-2
18*                       Changed implementation of toUnicode
19*                       function
20*   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
22*                       ucnvebdc.c
23*   09/20/2000  Ram     Added support for ISO-2022-CN
24*                       Added implementations for getNextUChar()
25*                       for specific 2022 country variants.
26*   10/31/2000  Ram     Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
37#include "ucnv_imp.h"
38#include "ucnv_bld.h"
39#include "ucnv_cnv.h"
40#include "ucnvmbcs.h"
41#include "cstring.h"
42#include "cmemory.h"
43
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot combine with
 * a neighbouring operator (e.g. "sizeof LENGTHOF(a)").
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46#ifdef U_ENABLE_GENERIC_ISO_2022
47/*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 *    its designation sequences, single shifts with return to the previous state,
54 *    switch-with-no-return to UTF-16BE or similar, etc.
55 *    This is unlike the language-specific variants like ISO-2022-JP which
56 *    require a much smaller repertoire of ISO-2022 features.
57 *    These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 *    but rather always one of the language-specific variants.
60 *    Note that ICU's generic ISO-2022 converter has always output one escape
61 *    sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 *    the previous converter is closed and a new one opened,
64 *    without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 *    reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 *    This means, for example, that when ISO-8859-7 is designated, the following
69 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 *    The ICU ISO-2022 converter does not handle this - and has no information
71 *    about which subconverter would have to be shifted vs. which is designed
72 *    for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76#endif
77
/* Single-byte shift controls as NUL-terminated strings */
static const char SHIFT_IN_STR[]  = "\x0F";     /* SI */
static const char SHIFT_OUT_STR[] = "\x0E";     /* SO */

/* Byte values of the control/whitespace characters named below */
#define H_TAB   0x09
#define LF      0x0A
#define V_TAB   0x0B
#define CR      0x0D
#define SPACE   0x20

/* First and last code point of the HWKANA range (cf. HWKANA_7BIT: halfwidth Katakana) */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};
91
92/*
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
99 */
enum {
    /* native byte range of a 96-character set */
    GR96_START=0xa0,
    GR96_END=0xff,
    /* native byte range of a 94-character set */
    GR94_START=0xa1,
    GR94_END=0xfe
};
106
107/*
108 * ISO 2022 control codes must not be converted from Unicode
109 * because they would mess up the byte stream.
110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
111 * corresponding to SO, SI, and ESC.
112 */
/*
 * c is compared as an unsigned 32-bit value: a negative argument is then
 * simply "not a control" instead of being used as a (negative) shift count,
 * which would be undefined behavior.  c is evaluated twice and is expected
 * to be a code point/byte of at most 32 bits.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(uint32_t)(c))&0x0800c000)!=0)
114
115/* for ISO-2022-JP and -CN implementations */
typedef enum  {
        /* ---- values shared by the JP and CN variants ---- */
        INVALID_STATE = -1,
        ASCII         = 0,

        SS2_STATE     = 0x10,
        SS3_STATE     = 0x11,

        /* ---- ISO-2022-JP charsets ---- */
        ISO8859_1     = 1,
        ISO8859_7     = 2,
        JISX201       = 3,
        JISX208       = 4,
        JISX212       = 5,
        GB2312        = 6,
        KSC5601       = 7,
        HWKANA_7BIT   = 8,    /* Halfwidth Katakana 7 bit */

        /* ---- ISO-2022-CN charsets ---- */
        /* these first values are fixed: they are indexes into myConverterArray[] */
        GB2312_1      = 1,
        ISO_IR_165    = 2,
        CNS_11643     = 3,

        /*
         * Plane-specific values for StateEnum and ISO2022State variables;
         * myConverterArray[] itself must still be indexed with CNS_11643.
         */
        CNS_11643_0   = 0x20,
        CNS_11643_1   = 0x21,
        CNS_11643_2   = 0x22,
        CNS_11643_3   = 0x23,
        CNS_11643_4   = 0x24,
        CNS_11643_5   = 0x25,
        CNS_11643_6   = 0x26,
        CNS_11643_7   = 0x27
} StateEnum;
153
/* TRUE if the StateEnum charset value cs lies in JISX208..KSC5601 (the JP double-byte charsets); evaluates cs twice */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)

/* charset mask: the single bit that stands for StateEnum value cs */
#define CSM(cs) ((uint16_t)1<<(cs))
158
159/*
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
162 *
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 *   all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
167 */
168enum { MAX_JA_VERSION=4 };
169static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
170    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
171    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
172    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
173    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
174    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
175};
176
/* Kind of charset; UConverterDataISO2022.currentType holds one of these */
typedef enum {
        ASCII1 = 0,
        LATIN1 = 1,
        SBCS   = 2,
        DBCS   = 3,
        MBCS   = 4,
        HWKANA = 5
}Cnv2022Type;
185
/* Designation and shift state of one conversion direction */
typedef struct ISO2022State {
    /* charset number designated to each of G0 (SI), G1 (SO), G2 (SS2), G3 (SS3) */
    int8_t cs[4];
    /* index 0..3 of the active set G0..G3 */
    int8_t g;
    /* value of g before a single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
191
/* the bits of the converter options that carry the "version=" number */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* number of slots in UConverterDataISO2022.myConverterArray[] */
#define UCNV_2022_MAX_CONVERTERS 10
194
195typedef struct{
196    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
197    UConverter *currentConverter;
198    Cnv2022Type currentType;
199    ISO2022State toU2022State, fromU2022State;
200    uint32_t key;
201    uint32_t version;
202#ifdef U_ENABLE_GENERIC_ISO_2022
203    UBool isFirstBuffer;
204#endif
205    UBool isEmptySegment;
206    char name[30];
207    char locale[3];
208}UConverterDataISO2022;
209
210/* Protos */
211/* ISO-2022 ----------------------------------------------------------------- */
212
213/*Forward declaration */
214U_CFUNC void
215ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
216                      UErrorCode * err);
217U_CFUNC void
218ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
219                                    UErrorCode * err);
220
/* the ESC byte */
#define ESC_2022 0x1B

/* Result of looking up a (partial) escape sequence in the state tables */
typedef enum
{
        /* does not correspond to a valid ISO 2022 escape sequence */
        INVALID_2022 = -1,
        /* a valid prefix of an escape sequence; more bytes are needed */
        VALID_NON_TERMINAL_2022 = 0,
        /* a complete, valid escape sequence */
        VALID_TERMINAL_2022 = 1,
        /* matches one escape sequence so far, but more bytes might match another one */
        VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
230
231/*
232* The way these state transition arrays work is:
233* ex : ESC$B is the sequence for JISX208
234*      a) First Iteration: char is ESC
235*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
236*             int x = normalize_esq_chars_2022[27] which is equal to 1
237*         ii) Search for this value in escSeqStateTable_Key_2022[]
238*             value of x is stored at escSeqStateTable_Key_2022[0]
239*        iii) Save this index as offset
240*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
241*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
242*     b) Switch on this state and continue to next char
243*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
244*             which is normalize_esq_chars_2022[36] == 4
245*         ii) x is currently 1(from above)
246*               x<<=5 -- x is now 32
247*               x+=normalize_esq_chars_2022[36]
248*               now x is 36
249*        iii) Search for this value in escSeqStateTable_Key_2022[]
250*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
251*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
252*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
253*     c) Switch on this state and continue to next char
254*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
255*        ii) x is currently 36 (from above)
256*            x<<=5 -- x is now 1152
257*            x+=normalize_esq_chars_2022[66]
258*            now x is 1161
259*       iii) Search for this value in escSeqStateTable_Key_2022[]
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
264*/
265
266
/*
 * First of the arrays that together form the escape sequence state
 * transition table: maps a byte value to a small code (1..29) that is folded
 * into the lookup key, 5 bits per byte.  Bytes with no code map to 0.
 * Laid out 16 entries per row; the row comment gives the first byte value.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
    /* 0x20 */  0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
    /* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
    /* 0x50 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
    /* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x80 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x90 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xA0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xB0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xC0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xD0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xE0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xF0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
298
299#ifdef U_ENABLE_GENERIC_ISO_2022
300/*
301 * When the generic ISO-2022 converter is completely removed, not just disabled
302 * per #ifdef, then the following state table and the associated tables that are
303 * dimensioned with MAX_STATES_2022 should be trimmed.
304 *
305 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
306 * the associated escape sequences starting with ESC ( B should be removed.
307 * This includes the ones with key values 1097 and all of the ones above 1000000.
308 *
309 * For the latter, the tables can simply be truncated.
310 * For the former, since the tables must be kept parallel, it is probably best
311 * to simply duplicate an adjacent table cell, parallel in all tables.
312 *
313 * It may make sense to restructure the tables, especially by using small search
314 * tables for the variants instead of indexing them parallel to the table here.
315 */
316#endif
317
/* number of entries in each of the parallel escSeqStateTable_*_2022 arrays */
#define MAX_STATES_2022 74

/*
 * Keys of the known escape sequences and of their prefixes, in ascending
 * order.  An entry's index is the offset into the parallel tables.
 * Laid out 8 entries per row; the row comment gives the first index.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,        34,       36,       39,       55,       57,       60,       61,
    /*  8 */ 1093,     1096,     1097,     1098,     1099,     1100,     1101,     1102,
    /* 16 */ 1103,     1104,     1105,     1106,     1109,     1154,     1157,     1160,
    /* 24 */ 1161,     1176,     1178,     1179,     1254,     1257,     1768,     1773,
    /* 32 */ 1957,     35105,    36933,    36936,    36937,    36938,    36939,    36940,
    /* 40 */ 36942,    36943,    36944,    36945,    36946,    36947,    36948,    37640,
    /* 48 */ 37642,    37644,    37646,    37711,    37744,    37745,    37746,    37747,
    /* 56 */ 37748,    40133,    40136,    40138,    40139,    40140,    40141,    1123363,
    /* 64 */ 35947624, 35947625, 35947626, 35947627, 35947629, 35947630, 35947631, 35947635,
    /* 72 */ 35947636, 35947638
};
331
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each entry of escSeqStateTable_Key_2022[] (same index);
 * NULL where the entry has no converter name.
 * Only compiled for the generic ISO-2022 converter.
 * Laid out 5 entries per row; the row comment gives the first index.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL,                   NULL,                   NULL,                   NULL,                   NULL,
    /*  5 */ NULL,                   NULL,                   NULL,                   "latin1",               "latin1",
    /* 10 */ "latin1",               "ibm-865",              "ibm-865",              "ibm-865",              "ibm-865",
    /* 15 */ "ibm-865",              "ibm-865",              "JISX0201",             "JISX0201",             "latin1",
    /* 20 */ "latin1",               NULL,                   "JISX-208",             "ibm-5478",             "JISX-208",
    /* 25 */ NULL,                   NULL,                   NULL,                   NULL,                   "UTF8",
    /* 30 */ "ISO-8859-1",           "ISO-8859-7",           "JIS-X-208",            NULL,                   "ibm-955",
    /* 35 */ "ibm-367",              "ibm-952",              "ibm-949",              "JISX-212",             "ibm-1383",
    /* 40 */ "ibm-952",              "ibm-964",              "ibm-964",              "ibm-964",              "ibm-964",
    /* 45 */ "ibm-964",              "ibm-964",              "ibm-5478",             "ibm-949",              "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1",     "CNS-11643-1992,2",     "CNS-11643-1992,3",     "CNS-11643-1992,4",     "CNS-11643-1992,5",
    /* 55 */ "CNS-11643-1992,6",     "CNS-11643-1992,7",     "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL,                   "latin1",
    /* 65 */ "ibm-912",              "ibm-913",              "ibm-914",              "ibm-813",              "ibm-1089",
    /* 70 */ "ibm-920",              "ibm-915",              "ibm-915",              "latin1"
};

#endif
348
349static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
350/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
351     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
352    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
353    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
354    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
355    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
356    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
357    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
358    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
359};
360
361
/* Identifies the ISO-2022 variant; type introduced for refactoring the changeState_2022 code */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,    /* exists only when the generic converter is compiled in */
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
371
372/*********** ISO 2022 Converter Protos ***********/
373static void
374_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
375
376static void
377 _ISO2022Close(UConverter *converter);
378
379static void
380_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
381
382static const char*
383_ISO2022getName(const UConverter* cnv);
384
385static void
386_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
387
388static UConverter *
389_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
390
391#ifdef U_ENABLE_GENERIC_ISO_2022
392static void
393T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
394#endif
395
396/*const UConverterSharedData _ISO2022Data;*/
397static const UConverterSharedData _ISO2022JPData;
398static const UConverterSharedData _ISO2022KRData;
399static const UConverterSharedData _ISO2022CNData;
400
401/*************** Converter implementations ******************/
402
403/* The purpose of this function is to get around gcc compiler warnings. */
404static U_INLINE void
405fromUWriteUInt8(UConverter *cnv,
406                 const char *bytes, int32_t length,
407                 uint8_t **target, const char *targetLimit,
408                 int32_t **offsets,
409                 int32_t sourceIndex,
410                 UErrorCode *pErrorCode)
411{
412    char *targetChars = (char *)*target;
413    ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
414                         offsets, sourceIndex, pErrorCode);
415    *target = (uint8_t*)targetChars;
416
417}
418
419static U_INLINE void
420setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
421    if(myConverterData->version == 1) {
422        UConverter *cnv = myConverterData->currentConverter;
423
424        cnv->toUnicodeStatus=0;     /* offset */
425        cnv->mode=0;                /* state */
426        cnv->toULength=0;           /* byteIndex */
427    }
428}
429
430static U_INLINE void
431setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
432   /* in ISO-2022-KR the designator sequence appears only once
433    * in a file so we append it only once
434    */
435    if( converter->charErrorBufferLength==0){
436
437        converter->charErrorBufferLength = 4;
438        converter->charErrorBuffer[0] = 0x1b;
439        converter->charErrorBuffer[1] = 0x24;
440        converter->charErrorBuffer[2] = 0x29;
441        converter->charErrorBuffer[3] = 0x43;
442    }
443    if(myConverterData->version == 1) {
444        UConverter *cnv = myConverterData->currentConverter;
445
446        cnv->fromUChar32=0;
447        cnv->fromUnicodeStatus=1;   /* prevLength */
448    }
449}
450
451static void
452_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
453
454    char myLocale[6]={' ',' ',' ',' ',' ',' '};
455
456    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
457    if(cnv->extraInfo != NULL) {
458        UConverterNamePieces stackPieces;
459        UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) };
460        UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
461        uint32_t version;
462
463        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
464
465        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
466        myConverterData->currentType = ASCII1;
467        cnv->fromUnicodeStatus =FALSE;
468        if(pArgs->locale){
469            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
470        }
471        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
472        myConverterData->version = version;
473
474        /* BEGIN android-changed */
475        /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */
476        /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */
477        if((myLocale[0]=='j' &&
478            (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' ||
479             myLocale[1]=='s') &&
480            (myLocale[2]=='_' || myLocale[2]=='\0')))
481        {
482            size_t len=0;
483            /* open the required converters and cache them */
484            if(version>MAX_JA_VERSION) {
485                /* prevent indexing beyond jpCharsetMasks[] */
486                myConverterData->version = version = 0;
487            }
488            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
489                myConverterData->myConverterArray[ISO8859_7] =
490                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
491            }
492            if (myLocale[1]=='k') {  /* Use KDDI's version. */
493                myConverterData->myConverterArray[JISX208] =
494                    ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
495            } else if (myLocale[1]=='s') {  /* Use SoftBank's version. */
496                myConverterData->myConverterArray[JISX208] =
497                    ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
498            } else {
499                myConverterData->myConverterArray[JISX208] =
500                    ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
501            }
502            /* END android-changed */
503
504            if(jpCharsetMasks[version]&CSM(JISX212)) {
505                myConverterData->myConverterArray[JISX212] =
506                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
507            }
508            if(jpCharsetMasks[version]&CSM(GB2312)) {
509                myConverterData->myConverterArray[GB2312] =
510                    /* BEGIN android-changed */
511                    ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
512                    /* END android-changed */
513            }
514            if(jpCharsetMasks[version]&CSM(KSC5601)) {
515                myConverterData->myConverterArray[KSC5601] =
516                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
517            }
518
519            /* set the function pointers to appropriate funtions */
520            cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
521            uprv_strcpy(myConverterData->locale,"ja");
522
523            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
524            len = uprv_strlen(myConverterData->name);
525            myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
526            myConverterData->name[len+1]='\0';
527        }
528        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
529            (myLocale[2]=='_' || myLocale[2]=='\0'))
530        {
531            const char *cnvName;
532            if(version==1) {
533                cnvName="icu-internal-25546";
534            } else {
535                /* BEGIN android-changed */
536                cnvName="ksc_5601";
537                /* END android-changed */
538                myConverterData->version=version=0;
539            }
540            if(pArgs->onlyTestIsLoadable) {
541                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
542                uprv_free(cnv->extraInfo);
543                cnv->extraInfo=NULL;
544                return;
545            } else {
546                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
547                if (U_FAILURE(*errorCode)) {
548                    _ISO2022Close(cnv);
549                    return;
550                }
551
552                if(version==1) {
553                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
554                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
555                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
556                }else{
557                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
558                }
559
560                /* initialize the state variables */
561                setInitialStateToUnicodeKR(cnv, myConverterData);
562                setInitialStateFromUnicodeKR(cnv, myConverterData);
563
564                /* set the function pointers to appropriate funtions */
565                cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
566                uprv_strcpy(myConverterData->locale,"ko");
567            }
568        }
569        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
570            (myLocale[2]=='_' || myLocale[2]=='\0'))
571        {
572
573            /* open the required converters and cache them */
574            /* BEGIN android-changed */
575            myConverterData->myConverterArray[GB2312_1] =
576                ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
577            if(version==1) {
578                myConverterData->myConverterArray[ISO_IR_165] =
579                    ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
580            }
581            myConverterData->myConverterArray[CNS_11643] =
582                ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
583            /* END android-changed */
584
585
586            /* set the function pointers to appropriate funtions */
587            cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
588            uprv_strcpy(myConverterData->locale,"cn");
589
590            if (version==0){
591                myConverterData->version = 0;
592                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
593            }else if (version==1){
594                myConverterData->version = 1;
595                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
596            }else {
597                myConverterData->version = 2;
598                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
599            }
600        }
601        else{
602#ifdef U_ENABLE_GENERIC_ISO_2022
603            myConverterData->isFirstBuffer = TRUE;
604
605            /* append the UTF-8 escape sequence */
606            cnv->charErrorBufferLength = 3;
607            cnv->charErrorBuffer[0] = 0x1b;
608            cnv->charErrorBuffer[1] = 0x25;
609            cnv->charErrorBuffer[2] = 0x42;
610
611            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
612            /* initialize the state variables */
613            uprv_strcpy(myConverterData->name,"ISO_2022");
614#else
615            *errorCode = U_UNSUPPORTED_ERROR;
616            return;
617#endif
618        }
619
620        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
621
622        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
623            _ISO2022Close(cnv);
624        }
625    } else {
626        *errorCode = U_MEMORY_ALLOCATION_ERROR;
627    }
628}
629
630
631static void
632_ISO2022Close(UConverter *converter) {
633    UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
634    UConverterSharedData **array = myData->myConverterArray;
635    int32_t i;
636
637    if (converter->extraInfo != NULL) {
638        /*close the array of converter pointers and free the memory*/
639        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
640            if(array[i]!=NULL) {
641                ucnv_unloadSharedDataIfReady(array[i]);
642            }
643        }
644
645        ucnv_close(myData->currentConverter);
646
647        if(!converter->isExtraLocal){
648            uprv_free (converter->extraInfo);
649            converter->extraInfo = NULL;
650        }
651    }
652}
653
654static void
655_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
656    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
657    if(choice<=UCNV_RESET_TO_UNICODE) {
658        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
659        myConverterData->key = 0;
660        myConverterData->isEmptySegment = FALSE;
661    }
662    if(choice!=UCNV_RESET_TO_UNICODE) {
663        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
664    }
665#ifdef U_ENABLE_GENERIC_ISO_2022
666    if(myConverterData->locale[0] == 0){
667        if(choice<=UCNV_RESET_TO_UNICODE) {
668            myConverterData->isFirstBuffer = TRUE;
669            myConverterData->key = 0;
670            if (converter->mode == UCNV_SO){
671                ucnv_close (myConverterData->currentConverter);
672                myConverterData->currentConverter=NULL;
673            }
674            converter->mode = UCNV_SI;
675        }
676        if(choice!=UCNV_RESET_TO_UNICODE) {
677            /* re-append UTF-8 escape sequence */
678            converter->charErrorBufferLength = 3;
679            converter->charErrorBuffer[0] = 0x1b;
680            converter->charErrorBuffer[1] = 0x28;
681            converter->charErrorBuffer[2] = 0x42;
682        }
683    }
684    else
685#endif
686    {
687        /* reset the state variables */
688        if(myConverterData->locale[0] == 'k'){
689            if(choice<=UCNV_RESET_TO_UNICODE) {
690                setInitialStateToUnicodeKR(converter, myConverterData);
691            }
692            if(choice!=UCNV_RESET_TO_UNICODE) {
693                setInitialStateFromUnicodeKR(converter, myConverterData);
694            }
695        }
696    }
697}
698
699static const char*
700_ISO2022getName(const UConverter* cnv){
701    if(cnv->extraInfo){
702        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
703        return myData->name;
704    }
705    return NULL;
706}
707
708
709/*************** to unicode *******************/
710/****************************************************************************
711 * Recognized escape sequences are
712 * <ESC>(B  ASCII
713 * <ESC>.A  ISO-8859-1
714 * <ESC>.F  ISO-8859-7
715 * <ESC>(J  JISX-201
 * <ESC>(I  JISX-201 Katakana (HWKANA_7BIT)
717 * <ESC>$B  JISX-208
718 * <ESC>$@  JISX-208
719 * <ESC>$(D JISX-212
720 * <ESC>$A  GB2312
721 * <ESC>$(C KSC5601
722 */
/*
 * ISO-2022-JP: state selected by each recognized escape sequence.
 * changeState_2022() indexes this table with the offset that getKey_2022()
 * reports for a complete escape sequence and casts the entry to StateEnum.
 * INVALID_STATE entries make changeState_2022() report
 * U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The entry order is positional; it must stay aligned with the
 * escape-sequence key table searched by getKey_2022().
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
734
735/*************** to unicode *******************/
/*
 * ISO-2022-CN: state selected by each recognized escape sequence.
 * changeState_2022() indexes this table with the offset that getKey_2022()
 * reports for a complete escape sequence and casts the entry to StateEnum.
 * INVALID_STATE entries make changeState_2022() report
 * U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The entry order is positional; it must stay aligned with the
 * escape-sequence key table searched by getKey_2022().
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
747
748
749static UCNV_TableStates_2022
750getKey_2022(char c,int32_t* key,int32_t* offset){
751    int32_t togo;
752    int32_t low = 0;
753    int32_t hi = MAX_STATES_2022;
754    int32_t oldmid=0;
755
756    togo = normalize_esq_chars_2022[(uint8_t)c];
757    if(togo == 0) {
758        /* not a valid character anywhere in an escape sequence */
759        *key = 0;
760        *offset = 0;
761        return INVALID_2022;
762    }
763    togo = (*key << 5) + togo;
764
765    while (hi != low)  /*binary search*/{
766
767        register int32_t mid = (hi+low) >> 1; /*Finds median*/
768
769        if (mid == oldmid)
770            break;
771
772        if (escSeqStateTable_Key_2022[mid] > togo){
773            hi = mid;
774        }
775        else if (escSeqStateTable_Key_2022[mid] < togo){
776            low = mid;
777        }
778        else /*we found it*/{
779            *key = togo;
780            *offset = mid;
781            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
782        }
783        oldmid = mid;
784
785    }
786
787    *key = 0;
788    *offset = 0;
789    return INVALID_2022;
790}
791
792/*runs through a state machine to determine the escape sequence - codepage correspondance
793 */
794static void
795changeState_2022(UConverter* _this,
796                const char** source,
797                const char* sourceLimit,
798                Variant2022 var,
799                UErrorCode* err){
800    UCNV_TableStates_2022 value;
801    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
802    uint32_t key = myData2022->key;
803    int32_t offset = 0;
804    int8_t initialToULength = _this->toULength;
805    char c;
806
807    value = VALID_NON_TERMINAL_2022;
808    while (*source < sourceLimit) {
809        c = *(*source)++;
810        _this->toUBytes[_this->toULength++]=(uint8_t)c;
811        value = getKey_2022(c,(int32_t *) &key, &offset);
812
813        switch (value){
814
815        case VALID_NON_TERMINAL_2022 :
816            /* continue with the loop */
817            break;
818
819        case VALID_TERMINAL_2022:
820            key = 0;
821            goto DONE;
822
823        case INVALID_2022:
824            goto DONE;
825
826        case VALID_MAYBE_TERMINAL_2022:
827#ifdef U_ENABLE_GENERIC_ISO_2022
828            /* ESC ( B is ambiguous only for ISO_2022 itself */
829            if(var == ISO_2022) {
830                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
831                _this->toULength = 0;
832
833                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
834
835                /* continue with the loop */
836                value = VALID_NON_TERMINAL_2022;
837                break;
838            } else
839#endif
840            {
841                /* not ISO_2022 itself, finish here */
842                value = VALID_TERMINAL_2022;
843                key = 0;
844                goto DONE;
845            }
846        }
847    }
848
849DONE:
850    myData2022->key = key;
851
852    if (value == VALID_NON_TERMINAL_2022) {
853        /* indicate that the escape sequence is incomplete: key!=0 */
854        return;
855    } else if (value == INVALID_2022 ) {
856        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
857    } else /* value == VALID_TERMINAL_2022 */ {
858        switch(var){
859#ifdef U_ENABLE_GENERIC_ISO_2022
860        case ISO_2022:
861        {
862            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
863            if(chosenConverterName == NULL) {
864                /* SS2 or SS3 */
865                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
866                _this->toUCallbackReason = UCNV_UNASSIGNED;
867                return;
868            }
869
870            _this->mode = UCNV_SI;
871            ucnv_close(myData2022->currentConverter);
872            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
873            if(U_SUCCESS(*err)) {
874                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
875                _this->mode = UCNV_SO;
876            }
877            break;
878        }
879#endif
880        case ISO_2022_JP:
881            {
882                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
883                switch(tempState) {
884                case INVALID_STATE:
885                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
886                    break;
887                case SS2_STATE:
888                    if(myData2022->toU2022State.cs[2]!=0) {
889                        if(myData2022->toU2022State.g<2) {
890                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
891                        }
892                        myData2022->toU2022State.g=2;
893                    } else {
894                        /* illegal to have SS2 before a matching designator */
895                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
896                    }
897                    break;
898                /* case SS3_STATE: not used in ISO-2022-JP-x */
899                case ISO8859_1:
900                case ISO8859_7:
901                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
902                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
903                    } else {
904                        /* G2 charset for SS2 */
905                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
906                    }
907                    break;
908                default:
909                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
910                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
911                    } else {
912                        /* G0 charset */
913                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
914                    }
915                    break;
916                }
917            }
918            break;
919        case ISO_2022_CN:
920            {
921                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
922                switch(tempState) {
923                case INVALID_STATE:
924                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
925                    break;
926                case SS2_STATE:
927                    if(myData2022->toU2022State.cs[2]!=0) {
928                        if(myData2022->toU2022State.g<2) {
929                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
930                        }
931                        myData2022->toU2022State.g=2;
932                    } else {
933                        /* illegal to have SS2 before a matching designator */
934                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
935                    }
936                    break;
937                case SS3_STATE:
938                    if(myData2022->toU2022State.cs[3]!=0) {
939                        if(myData2022->toU2022State.g<2) {
940                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
941                        }
942                        myData2022->toU2022State.g=3;
943                    } else {
944                        /* illegal to have SS3 before a matching designator */
945                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
946                    }
947                    break;
948                case ISO_IR_165:
949                    if(myData2022->version==0) {
950                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
951                        break;
952                    }
953                    /*fall through*/
954                case GB2312_1:
955                    /*fall through*/
956                case CNS_11643_1:
957                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
958                    break;
959                case CNS_11643_2:
960                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
961                    break;
962                default:
963                    /* other CNS 11643 planes */
964                    if(myData2022->version==0) {
965                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
966                    } else {
967                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
968                    }
969                    break;
970                }
971            }
972            break;
973        case ISO_2022_KR:
974            if(offset==0x30){
975                /* nothing to be done, just accept this one escape sequence */
976            } else {
977                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
978            }
979            break;
980
981        default:
982            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
983            break;
984        }
985    }
986    if(U_SUCCESS(*err)) {
987        _this->toULength = 0;
988    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
989        if(_this->toULength>1) {
990            /*
991             * Ticket 5691: consistent illegal sequences:
992             * - We include at least the first byte (ESC) in the illegal sequence.
993             * - If any of the non-initial bytes could be the start of a character,
994             *   we stop the illegal sequence before the first one of those.
995             *   In escape sequences, all following bytes are "printable", that is,
996             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
997             *   they are valid single/lead bytes.
998             *   For simplicity, we always only report the initial ESC byte as the
999             *   illegal sequence and back out all other bytes we looked at.
1000             */
1001            /* Back out some bytes. */
1002            int8_t backOutDistance=_this->toULength-1;
1003            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1004            if(backOutDistance<=bytesFromThisBuffer) {
1005                /* same as initialToULength<=1 */
1006                *source-=backOutDistance;
1007            } else {
1008                /* Back out bytes from the previous buffer: Need to replay them. */
1009                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1010                /* same as -(initialToULength-1) */
1011                /* preToULength is negative! */
1012                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1013                *source-=bytesFromThisBuffer;
1014            }
1015            _this->toULength=1;
1016        }
1017    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1018        _this->toUCallbackReason = UCNV_UNASSIGNED;
1019    }
1020}
1021
1022/*Checks the characters of the buffer against valid 2022 escape sequences
1023*if the match we return a pointer to the initial start of the sequence otherwise
1024*we return sourceLimit
1025*/
1026/*for 2022 looks ahead in the stream
1027 *to determine the longest possible convertible
1028 *data stream
1029 */
1030static U_INLINE const char*
1031getEndOfBuffer_2022(const char** source,
1032                   const char* sourceLimit,
1033                   UBool flush){
1034
1035    const char* mySource = *source;
1036
1037#ifdef U_ENABLE_GENERIC_ISO_2022
1038    if (*source >= sourceLimit)
1039        return sourceLimit;
1040
1041    do{
1042
1043        if (*mySource == ESC_2022){
1044            int8_t i;
1045            int32_t key = 0;
1046            int32_t offset;
1047            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1048
1049            /* Kludge: I could not
1050            * figure out the reason for validating an escape sequence
1051            * twice - once here and once in changeState_2022().
1052            * is it possible to have an ESC character in a ISO2022
1053            * byte stream which is valid in a code page? Is it legal?
1054            */
1055            for (i=0;
1056            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1057            i++) {
1058                value =  getKey_2022(*(mySource+i), &key, &offset);
1059            }
1060            if (value > 0 || *mySource==ESC_2022)
1061                return mySource;
1062
1063            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1064                return sourceLimit;
1065        }
1066    }while (++mySource < sourceLimit);
1067
1068    return sourceLimit;
1069#else
1070    while(mySource < sourceLimit && *mySource != ESC_2022) {
1071        ++mySource;
1072    }
1073    return mySource;
1074#endif
1075}
1076
1077
1078/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1079 * any future change in _MBCSFromUChar32() function should be reflected here.
1080 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1081 */
1082static U_INLINE int32_t
1083MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1084                                         UChar32 c,
1085                                         uint32_t* value,
1086                                         UBool useFallback,
1087                                         int outputType)
1088{
1089    const int32_t *cx;
1090    const uint16_t *table;
1091    uint32_t stage2Entry;
1092    uint32_t myValue;
1093    int32_t length;
1094    const uint8_t *p;
1095    /*
1096     * TODO(markus): Use and require new, faster MBCS conversion table structures.
1097     * Use internal version of ucnv_open() that verifies that the new structures are available,
1098     * else U_INTERNAL_PROGRAM_ERROR.
1099     */
1100    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1101    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1102        table=sharedData->mbcs.fromUnicodeTable;
1103        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1104        /* get the bytes and the length for the output */
1105        if(outputType==MBCS_OUTPUT_2){
1106            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1107            if(myValue<=0xff) {
1108                length=1;
1109            } else {
1110                length=2;
1111            }
1112        } else /* outputType==MBCS_OUTPUT_3 */ {
1113            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1114            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1115            if(myValue<=0xff) {
1116                length=1;
1117            } else if(myValue<=0xffff) {
1118                length=2;
1119            } else {
1120                length=3;
1121            }
1122        }
1123        /* is this code point assigned, or do we use fallbacks? */
1124        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1125            /* assigned */
1126            *value=myValue;
1127            return length;
1128        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1129            /*
1130             * We allow a 0 byte output if the "assigned" bit is set for this entry.
1131             * There is no way with this data structure for fallback output
1132             * to be a zero byte.
1133             */
1134            *value=myValue;
1135            return -length;
1136        }
1137    }
1138
1139    cx=sharedData->mbcs.extIndexes;
1140    if(cx!=NULL) {
1141        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1142    }
1143
1144    /* unassigned */
1145    return 0;
1146}
1147
1148/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1149 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1150 * @param retval pointer to output byte
1151 * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1152 */
1153static U_INLINE int32_t
1154MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1155                                       UChar32 c,
1156                                       uint32_t* retval,
1157                                       UBool useFallback)
1158{
1159    const uint16_t *table;
1160    int32_t value;
1161    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1162    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1163        return 0;
1164    }
1165    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1166    table=sharedData->mbcs.fromUnicodeTable;
1167    /* get the byte for the output */
1168    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1169    /* is this code point assigned, or do we use fallbacks? */
1170    *retval=(uint32_t)(value&0xff);
1171    if(value>=0xf00) {
1172        return 1;  /* roundtrip */
1173    } else if(useFallback ? value>=0x800 : value>=0xc00) {
1174        return -1;  /* fallback taken */
1175    } else {
1176        return 0;  /* no mapping */
1177    }
1178}
1179
/*
 * Converts a strict EUC DBCS value to the ISO 2022 byte range.
 *
 * The low 16 bits of value must lie in A1A1..FEFE and the trail byte in
 * A1..FE, which together means that both bytes are in A1..FE.
 * An accepted value has 0x80 subtracted from each byte, moving it to the
 * ISO 2022 range 21..7E.
 * Return 0 if out of range.
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint32_t low16 = value & 0xffff;
    const uint32_t trail = value & 0xff;

    if(low16 < 0xa1a1 || low16 > 0xfefe) {
        return 0;  /* lead byte outside A1..FE */
    }
    if(trail < 0xa1 || trail > 0xfe) {
        return 0;  /* trail byte outside A1..FE */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1196
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Compiled out (#if 0); kept only as a reference for the reverse mapping.
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
1214
1215#ifdef U_ENABLE_GENERIC_ISO_2022
1216
1217/**********************************************************************************
1218*  ISO-2022 Converter
1219*
1220*
1221*/
1222
/*
 * To-Unicode conversion for the generic ISO-2022 converter
 * (only compiled with U_ENABLE_GENERIC_ISO_2022).
 *
 * The input is processed in alternating steps:
 *  1. Unless an escape sequence is pending (myData->key != 0), the bytes up
 *     to the position returned by getEndOfBuffer_2022() are converted by the
 *     current sub-converter through ucnv_toUnicode(). An "ASCII" converter is
 *     opened on demand if no sub-converter has been selected yet.
 *  2. The escape sequence at the stop position is handed to
 *     changeState_2022() with variant ISO_2022.
 *
 * Overflow output, invalid bytes and partial input of the sub-converter are
 * copied into this converter (saveThis) before returning, so that the caller
 * sees them on the converter it actually called.
 *
 * @param args conversion arguments; args->converter is temporarily replaced
 *             by the sub-converter during ucnv_toUnicode() and then restored
 * @param err  in/out error code
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    /* no charset selected yet: start out with ASCII */
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                args->converter = myData->currentConverter;
                /* flush the sub-converter only if this segment really ends the input */
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /*
                 * Note on grouping: && binds tighter than ||, so the last two
                 * lines of the condition read as
                 *   (offsets!=NULL && (target or source moved)) ||
                 *   (stopped before the real limit && sub-converter holds partial input)
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* args->source now points at an ESC, or an escape sequence is being continued */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1326
1327#endif
1328
1329/*
1330 * To Unicode Callback helper function
1331 */
1332static void
1333toUnicodeCallback(UConverter *cnv,
1334                  const uint32_t sourceChar, const uint32_t targetUniChar,
1335                  UErrorCode* err){
1336    if(sourceChar>0xff){
1337        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1338        cnv->toUBytes[1] = (uint8_t)sourceChar;
1339        cnv->toULength = 2;
1340    }
1341    else{
1342        cnv->toUBytes[0] =(char) sourceChar;
1343        cnv->toULength = 1;
1344    }
1345
1346    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1347        *err = U_INVALID_CHAR_FOUND;
1348    }
1349    else{
1350        *err = U_ILLEGAL_CHAR_FOUND;
1351    }
1352}
1353
1354/**************************************ISO-2022-JP*************************************************/
1355
1356/************************************** IMPORTANT **************************************************
1357* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1358* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1359* The converter iterates over each Unicode codepoint
1360* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1361* processed one char at a time it would make sense to reduce the extra processing a canned converter
1362* would do as far as possible.
1363*
1364* If the implementation of these macros or structure of sharedData struct change in the future, make
1365* sure that ISO-2022 is also changed.
1366***************************************************************************************************
1367*/
1368
1369/***************************************************************************************************
1370* Rules for ISO-2022-jp encoding
1371* (i)   Escape sequences must be fully contained within a line they should not
1372*       span new lines or CRs
1373* (ii)  If the last character on a line is represented by two bytes then an ASCII or
1374*       JIS-Roman character escape sequence should follow before the line terminates
1375* (iii) If the first character on the line is represented by two bytes then a two
1376*       byte character escape sequence should precede it
1377* (iv)  If no escape sequence is encountered then the characters are ASCII
1378* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1379*       and invoked with SS2 (ESC N).
1380* (vi)  If there is any G0 designation in text, there must be a switch to
1381*       ASCII or to JIS X 0201-Roman before a space character (but not
1382*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1383*       characters such as tab or CRLF.
* (vii) Supported encodings:
1385*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1386*
1387*  source : RFC-1554
1388*
1389*          JISX201, JISX208,JISX212 : new .cnv data files created
1390*          KSC5601 : alias to ibm-949 mapping table
1391*          GB2312 : alias to ibm-1386 mapping table
1392*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
1394*/
1395
/*
 * Preference order of the JP charsets.
 * When the fromUnicode function builds its list of charsets to try, it first
 * adds the currently designated charsets and then walks this array in order,
 * appending every charset that is allowed for the converter version and has
 * not been added yet.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1408
/*
 * Designation escape sequences, one row per charset.
 * The table is indexed by the charset value itself (escSeqChars[cs]), so the
 * rows must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 * The number of bytes to copy from each row is kept in escSeqCharsLen[].
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Byte lengths of the escape sequences in escSeqChars[], in the same row order.
 * The two tables are used together: escSeqCharsLen[cs] bytes are copied from
 * escSeqChars[cs], so any change to one table must be mirrored in the other.
 */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1436
1437/*
1438* The iteration over various code pages works this way:
1439* i)   Get the currentState from myConverterData->currentState
1440* ii)  Check if the character is mapped to a valid character in the currentState
1441*      Yes ->  a) set the initIterState to currentState
1442*       b) remain in this state until an invalid character is found
1443*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if the current state
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings
*       break and return a U_INVALID_CHAR_FOUND error
1448*      No  ->  Continue and find the character in next code page
1449*
1450*
1451* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1452*/
1453
/*
 * Convert a JIS X 0201 Roman byte (00..7F) to its Unicode code point.
 * Only two positions are remapped: 5C becomes U+00A5 and 7E becomes U+203E.
 * Every other input value is returned unchanged.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1467
/*
 * Convert a Unicode code point to a JIS X 0201 Roman byte (00..7F).
 * U+00A5 maps to 5C and U+203E maps to 7E; U+005C and U+007E themselves have
 * no mapping. All other code points up to U+007F map to themselves.
 * Returns 0xfffe for anything that cannot be mapped.
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two byte positions are occupied by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1482
/*
 * Translate a valid Shift-JIS byte pair into the corresponding pair of
 * JIS X 0208 bytes in the 21..7E range.
 * The input is the pair packed as (lead << 8) | trail; the result is packed
 * the same way. Pairs above 0xEFFC lie beyond JIS X 0208 and yield 0.
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead, result;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;
    lead = value & 0xff00;

    /* lead bytes up to 9F and lead bytes E0..EF use different offsets */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    result = lead << 1;

    if(trail > 0x9e) {
        /* trail 9F..FC: second of the two rows that share this lead byte */
        result |= trail - 0x7e;
    } else {
        /* trail up to 9E: first row; the trail offset changes after 7E */
        result -= 0x100;
        result |= trail - ((trail <= 0x7e) ? 0x1f : 0x20);
    }
    return result;
}
1518
/*
 * Translate a pair of JIS X 0208 bytes (each expected in 21..7E) into the
 * equivalent Shift-JIS byte pair, written to bytes[0] and bytes[1].
 * Out-of-range input must not produce a valid Shift-JIS pair, so that the
 * converter that consumes the result rejects it. Some invalid inputs already
 * turn into equally invalid Shift-JIS bytes and are not tested explicitly;
 * the rest are forced to 0.
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead, trail;

    if((c1 & 1) == 0) {
        /* even first byte: the trail byte lands in 9F..FC */
        if((uint8_t)(c2 - 0x21) <= (0x7e - 0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* odd first byte: round it up, the trail byte lands in 40..9E */
        ++c1;
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    }

    lead = (uint8_t)(c1 >> 1);
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1555
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table is indexed by (code point - HWKANA_START). The fromUnicode function
 * uses an entry directly as the two-byte output value for JISX208, without
 * passing it through _2022FromSJIS().
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C,  /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F,  /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F,  /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C   /* U+FF9F */
};
1628
/*
 * Convert UTF-16 text from args->source into ISO-2022-JP family bytes in
 * args->target, optionally recording source offsets in args->offsets.
 *
 * For each code point the function
 *   - assembles surrogate pairs (a pending lead surrogate from a previous call
 *     is picked up from cnv->fromUChar32),
 *   - rejects SO/SI/ESC,
 *   - tries the charsets in choices[] until one yields a roundtrip mapping
 *     (a fallback mapping is kept only if no roundtrip is found),
 *   - writes SI, a designation escape sequence and/or a shift as needed,
 *     followed by the one or two character bytes.
 *
 * When args->flush is set and all input has been consumed without error, the
 * output is returned to G0 ASCII.
 *
 * Errors set in *err:
 *   U_ILLEGAL_CHAR_FOUND    unmatched surrogate, or SO/SI/ESC in the input
 *   U_INVALID_CHAR_FOUND    no charset in choices[] can represent the code point
 *   U_BUFFER_OVERFLOW_ERROR the target is full while input remains
 * For the first two, the offending code point is stored in cnv->fromUChar32.
 *
 * args->source and args->target are updated before returning.
 */
static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    /* staging area for all bytes produced for one code point (SI, escape, shift, character) */
    char buffer[8];
    int32_t len, outLen;
    /* charsets to try, in order; rebuilt whenever choiceCount is 0 */
    int8_t choices[10];
    int32_t choiceCount;
    uint32_t targetValue = 0;
    UBool useFallback;

    int32_t i;
    int8_t cs, g;

    /* set up the state */
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State   = &converterData->fromU2022State;

    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        /* jumps into the loop body; getTrail itself checks whether input is available */
        goto getTrail;
    }

    while(source < sourceLimit) {
        if(target < targetLimit) {

            sourceChar  = *(source++);
            /*check if the char is a First surrogate*/
            if(UTF_IS_SURROGATE(sourceChar)) {
                if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(UTF_IS_SECOND_SURROGATE(trail)) {
                            source++;
                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                            cnv->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            cnv->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input: remember the lead surrogate for the next call */
                        cnv->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            /* do the conversion */

            /*
             * (Re)build choices[] only when it has been invalidated:
             * on entry, after a designation change, and after CR/LF.
             */
            if(choiceCount == 0) {
                uint16_t csm;

                /*
                 * The csm variable keeps track of which charsets are allowed
                 * and not used yet while building the choices[].
                 */
                csm = jpCharsetMasks[converterData->version];
                choiceCount = 0;

                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
                if(converterData->version == 3 || converterData->version == 4) {
                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
                }
                /* Do not try single-byte half-width Katakana for other versions. */
                csm &= ~CSM(HWKANA_7BIT);

                /* try the current G0 charset */
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
                csm &= ~CSM(cs);

                /* try the current G2 charset */
                if((cs = pFromU2022State->cs[2]) != 0) {
                    choices[choiceCount++] = cs;
                    csm &= ~CSM(cs);
                }

                /* try all the other possible charsets */
                for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
                    cs = (int8_t)jpCharsetPref[i];
                    if(CSM(cs) & csm) {
                        choices[choiceCount++] = cs;
                        csm &= ~CSM(cs);
                    }
                }
            }

            cs = g = 0;
            /*
             * len==0: no mapping found yet
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
             * len>0: found a roundtrip result, done
             */
            len = 0;
            /*
             * We will turn off useFallback after finding a fallback,
             * but we still get fallbacks from PUA code points as usual.
             * Therefore, we will also need to check that we don't overwrite
             * an early fallback with a later one.
             */
            useFallback = cnv->useFallback;

            /* on success each case sets targetValue, len, cs (charset to designate) and g (G0/G1/G2) */
            for(i = 0; i < choiceCount && len <= 0; ++i) {
                uint32_t value;
                int32_t len2;
                int8_t cs0 = choices[i];
                switch(cs0) {
                case ASCII:
                    if(sourceChar <= 0x7f) {
                        targetValue = (uint32_t)sourceChar;
                        len = 1;
                        cs = cs0;
                        g = 0;
                    }
                    break;
                case ISO8859_1:
                    /* G2 SBCS: the GR96 range is shifted down by 0x80 for 7-bit output */
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
                        targetValue = (uint32_t)sourceChar - 0x80;
                        len = 1;
                        cs = cs0;
                        g = 2;
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        if(converterData->version==3) {
                            /* JIS7: use G1 (SO) */
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
                            len = 1;
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
                            g = 1;
                        } else if(converterData->version==4) {
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
                            len = 1;

                            cs = pFromU2022State->cs[0];
                            if(IS_JP_DBCS(cs)) {
                                /* switch from a DBCS charset to JISX201 */
                                cs = (int8_t)JISX201;
                            }
                            /* else stay in the current G0 charset */
                            g = 0;
                        }
                        /* else do not use HWKANA_7BIT with other versions */
                    }
                    break;
                case JISX201:
                    /* G0 SBCS */
                    value = jisx201FromU(sourceChar);
                    if(value <= 0x7f) {
                        targetValue = value;
                        len = 1;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case JISX208:
                    /* G0 DBCS from Shift-JIS table */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    /* a fallback (len2 == -2) is accepted only if nothing has been found yet */
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        value = _2022FromSJIS(value);
                        if(value != 0) {
                            targetValue = value;
                            len = len2;
                            cs = cs0;
                            g = 0;
                            useFallback = FALSE;
                        }
                    } else if(len == 0 && useFallback &&
                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        /* hardcoded half-width to full-width Katakana fallback, recorded as a fallback (len < 0) */
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
                        len = -2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case ISO8859_7:
                    /* G2 SBCS (g = 2 below) forced to 7-bit output */
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback);
                    /* accept a roundtrip, or a fallback if nothing has been found yet; only GR96 bytes qualify */
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
                        targetValue = value - 0x80;
                        len = len2;
                        cs = cs0;
                        g = 2;
                        useFallback = FALSE;
                    }
                    break;
                default:
                    /* G0 DBCS */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        if(cs0 == KSC5601) {
                            /*
                             * Check for valid bytes for the encoding scheme.
                             * This is necessary because the sub-converter (windows-949)
                             * has a broader encoding scheme than is valid for 2022.
                             */
                            value = _2022FromGR94DBCS(value);
                            if(value == 0) {
                                break;
                            }
                        }
                        targetValue = value;
                        len = len2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                }
            }

            if(len != 0) {
                if(len < 0) {
                    len = -len;  /* fallback */
                }
                outLen = 0; /* count output bytes */

                /* write SI if necessary (only for JIS7) */
                if(pFromU2022State->g == 1 && g == 0) {
                    buffer[outLen++] = UCNV_SI;
                    pFromU2022State->g = 0;
                }

                /* write the designation sequence if necessary */
                if(cs != pFromU2022State->cs[g]) {
                    int32_t escLen = escSeqCharsLen[cs];
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
                    outLen += escLen;
                    pFromU2022State->cs[g] = cs;

                    /* invalidate the choices[] */
                    choiceCount = 0;
                }

                /* write the shift sequence if necessary */
                if(g != pFromU2022State->g) {
                    switch(g) {
                    /* case 0 handled before writing escapes */
                    case 1:
                        buffer[outLen++] = UCNV_SO;
                        pFromU2022State->g = 1;
                        break;
                    default: /* case 2 */
                        /* ESC N (SS2); pFromU2022State->g is deliberately left unchanged here */
                        buffer[outLen++] = 0x1b;
                        buffer[outLen++] = 0x4e;
                        break;
                    /* no case 3: no SS3 in ISO-2022-JP-x */
                    }
                }

                /* write the output bytes */
                if(len == 1) {
                    buffer[outLen++] = (char)targetValue;
                } else /* len == 2 */ {
                    buffer[outLen++] = (char)(targetValue >> 8);
                    buffer[outLen++] = (char)targetValue;
                }
            } else {
                /*
                 * if we cannot find the character after checking all codepages
                 * then this is an error
                 */
                *err = U_INVALID_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            if(sourceChar == CR || sourceChar == LF) {
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
                pFromU2022State->cs[2] = 0;
                choiceCount = 0;
            }

            /* output outLen>0 bytes in buffer[] */
            if(outLen == 1) {
                /* fast path: the enclosing if() guarantees room for one byte */
                *target++ = buffer[0];
                if(offsets) {
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
                }
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
                /* fast path: two bytes that both fit into the target */
                *target++ = buffer[0];
                *target++ = buffer[1];
                if(offsets) {
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
                    *offsets++ = sourceIndex;
                    *offsets++ = sourceIndex;
                }
            } else {
                /* general path: let fromUWriteUInt8() write the bytes and report errors through err */
                fromUWriteUInt8(
                    cnv,
                    buffer, outLen,
                    &target, (const char *)targetLimit,
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
                    err);
                if(U_FAILURE(*err)) {
                    break;
                }
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-JP conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   in SO mode or not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
    ) {
        int32_t sourceIndex;

        outLen = 0;

        /* leave SO mode first */
        if(pFromU2022State->g != 0) {
            buffer[outLen++] = UCNV_SI;
            pFromU2022State->g = 0;
        }

        /* then designate ASCII to G0 */
        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more if the last unit is a trail surrogate (see the TODO above for the sourceIndex==0 case) */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            /* no input was consumed in this call */
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            cnv,
            buffer, outLen,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}
2043
2044/*************** to unicode *******************/
2045
2046static void
2047UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2048                                               UErrorCode* err){
2049    char tempBuf[2];
2050    const char *mySource = (char *) args->source;
2051    UChar *myTarget = args->target;
2052    const char *mySourceLimit = args->sourceLimit;
2053    uint32_t targetUniChar = 0x0000;
2054    uint32_t mySourceChar = 0x0000;
2055    uint32_t tmpSourceChar = 0x0000;
2056    UConverterDataISO2022* myData;
2057    ISO2022State *pToU2022State;
2058    StateEnum cs;
2059
2060    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2061    pToU2022State = &myData->toU2022State;
2062
2063    if(myData->key != 0) {
2064        /* continue with a partial escape sequence */
2065        goto escape;
2066    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2067        /* continue with a partial double-byte character */
2068        mySourceChar = args->converter->toUBytes[0];
2069        args->converter->toULength = 0;
2070        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2071        targetUniChar = missingCharMarker;
2072        goto getTrailByte;
2073    }
2074
2075    while(mySource < mySourceLimit){
2076
2077        targetUniChar =missingCharMarker;
2078
2079        if(myTarget < args->targetLimit){
2080
2081            mySourceChar= (unsigned char) *mySource++;
2082
2083            switch(mySourceChar) {
2084            case UCNV_SI:
2085                if(myData->version==3) {
2086                    pToU2022State->g=0;
2087                    continue;
2088                } else {
2089                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2090                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2091                    break;
2092                }
2093
2094            case UCNV_SO:
2095                if(myData->version==3) {
2096                    /* JIS7: switch to G1 half-width Katakana */
2097                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2098                    pToU2022State->g=1;
2099                    continue;
2100                } else {
2101                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2102                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2103                    break;
2104                }
2105
2106            case ESC_2022:
2107                mySource--;
2108escape:
2109                {
2110                    const char * mySourceBefore = mySource;
2111                    int8_t toULengthBefore = args->converter->toULength;
2112
2113                    changeState_2022(args->converter,&(mySource),
2114                        mySourceLimit, ISO_2022_JP,err);
2115
2116                    /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2117                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2118                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2119                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
2120                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2121                    }
2122                }
2123
2124                /* invalid or illegal escape sequence */
2125                if(U_FAILURE(*err)){
2126                    args->target = myTarget;
2127                    args->source = mySource;
2128                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
2129                    return;
2130                }
2131                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2132                if(myData->key==0) {
2133                    myData->isEmptySegment = TRUE;
2134                }
2135                continue;
2136
2137            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2138
2139            case CR:
2140                /*falls through*/
2141            case LF:
2142                /* automatically reset to single-byte mode */
2143                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2144                    pToU2022State->cs[0] = (int8_t)ASCII;
2145                }
2146                pToU2022State->cs[2] = 0;
2147                pToU2022State->g = 0;
2148                /* falls through */
2149            default:
2150                /* convert one or two bytes */
2151                myData->isEmptySegment = FALSE;
2152                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2153                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2154                    !IS_JP_DBCS(cs)
2155                ) {
2156                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2157                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2158
2159                    /* return from a single-shift state to the previous one */
2160                    if(pToU2022State->g >= 2) {
2161                        pToU2022State->g=pToU2022State->prevG;
2162                    }
2163                } else switch(cs) {
2164                case ASCII:
2165                    if(mySourceChar <= 0x7f) {
2166                        targetUniChar = mySourceChar;
2167                    }
2168                    break;
2169                case ISO8859_1:
2170                    if(mySourceChar <= 0x7f) {
2171                        targetUniChar = mySourceChar + 0x80;
2172                    }
2173                    /* return from a single-shift state to the previous one */
2174                    pToU2022State->g=pToU2022State->prevG;
2175                    break;
2176                case ISO8859_7:
2177                    if(mySourceChar <= 0x7f) {
2178                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
2179                        targetUniChar =
2180                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2181                                myData->myConverterArray[cs],
2182                                mySourceChar + 0x80);
2183                    }
2184                    /* return from a single-shift state to the previous one */
2185                    pToU2022State->g=pToU2022State->prevG;
2186                    break;
2187                case JISX201:
2188                    if(mySourceChar <= 0x7f) {
2189                        targetUniChar = jisx201ToU(mySourceChar);
2190                    }
2191                    break;
2192                case HWKANA_7BIT:
2193                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2194                        /* 7-bit halfwidth Katakana */
2195                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2196                    }
2197                    break;
2198                default:
2199                    /* G0 DBCS */
2200                    if(mySource < mySourceLimit) {
2201                        int leadIsOk, trailIsOk;
2202                        uint8_t trailByte;
2203getTrailByte:
2204                        trailByte = (uint8_t)*mySource;
2205                        /*
2206                         * Ticket 5691: consistent illegal sequences:
2207                         * - We include at least the first byte in the illegal sequence.
2208                         * - If any of the non-initial bytes could be the start of a character,
2209                         *   we stop the illegal sequence before the first one of those.
2210                         *
2211                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2212                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2213                         * Otherwise we convert or report the pair of bytes.
2214                         */
2215                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2216                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2217                        if (leadIsOk && trailIsOk) {
2218                            ++mySource;
2219                            tmpSourceChar = (mySourceChar << 8) | trailByte;
2220                            if(cs == JISX208) {
2221                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2222                                mySourceChar = tmpSourceChar;
2223                            } else {
2224                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2225                                mySourceChar = tmpSourceChar;
2226                                if (cs == KSC5601) {
2227                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2228                                }
2229                                tempBuf[0] = (char)(tmpSourceChar >> 8);
2230                                tempBuf[1] = (char)(tmpSourceChar);
2231                            }
2232                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2233                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2234                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2235                            ++mySource;
2236                            /* add another bit so that the code below writes 2 bytes in case of error */
2237                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2238                        }
2239                    } else {
2240                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2241                        args->converter->toULength = 1;
2242                        goto endloop;
2243                    }
2244                }  /* End of inner switch */
2245                break;
2246            }  /* End of outer switch */
2247            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2248                if(args->offsets){
2249                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2250                }
2251                *(myTarget++)=(UChar)targetUniChar;
2252            }
2253            else if(targetUniChar > missingCharMarker){
2254                /* disassemble the surrogate pair and write to output*/
2255                targetUniChar-=0x0010000;
2256                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2257                if(args->offsets){
2258                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2259                }
2260                ++myTarget;
2261                if(myTarget< args->targetLimit){
2262                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2263                    if(args->offsets){
2264                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2265                    }
2266                    ++myTarget;
2267                }else{
2268                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2269                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2270                }
2271
2272            }
2273            else{
2274                /* Call the callback function*/
2275                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2276                break;
2277            }
2278        }
2279        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2280            *err =U_BUFFER_OVERFLOW_ERROR;
2281            break;
2282        }
2283    }
2284endloop:
2285    args->target = myTarget;
2286    args->source = mySource;
2287}
2288
2289
2290/***************************************************************
2291*   Rules for ISO-2022-KR encoding
2292*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
2294*      means that it appears by itself on the first line of the file
2295*  ii) There are only 2 shifting sequences SO to shift into double byte mode
2296*      and SI to shift into single byte mode
2297*/
2298static void
2299UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2300
2301    UConverter* saveConv = args->converter;
2302    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2303    args->converter=myConverterData->currentConverter;
2304
2305    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2306    ucnv_MBCSFromUnicodeWithOffsets(args,err);
2307    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2308
2309    if(*err == U_BUFFER_OVERFLOW_ERROR) {
2310        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2311            uprv_memcpy(
2312                saveConv->charErrorBuffer,
2313                myConverterData->currentConverter->charErrorBuffer,
2314                myConverterData->currentConverter->charErrorBufferLength);
2315        }
2316        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2317        myConverterData->currentConverter->charErrorBufferLength = 0;
2318    }
2319    args->converter=saveConv;
2320}
2321
/*
 * Converts UTF-16 text to ISO-2022-KR bytes, optionally recording for each
 * output byte the index of the source code unit it came from.
 *
 * Overview of what the code below does:
 *  - If the converter data's version is 1, all work is delegated to
 *    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 *  - The single-byte/double-byte mode is read from
 *    args->converter->fromUnicodeStatus on entry and written back on exit.
 *  - SO/SI/ESC in the input are rejected with U_ILLEGAL_CHAR_FOUND.
 *  - Each code unit is looked up with MBCS_FROM_UCHAR32_ISO2022(); only
 *    one-byte results <= 0x7f and two-byte results that pass the
 *    0xa1a1..0xfefe / low byte 0xa1..0xfe range checks are accepted.
 *  - A shift byte (UCNV_SO or UCNV_SI) is written whenever the mode changes;
 *    double-byte results are written with 0x80 subtracted from each byte.
 *  - Bytes that do not fit into the target are stored in the converter's
 *    charErrorBuffer and U_BUFFER_OVERFLOW_ERROR is set.
 *  - For anything that cannot be converted, the error code is set,
 *    the code point is stored in args->converter->fromUChar32 and the loop
 *    stops.
 *  - On a successful flush at the end of the input while in double-byte
 *    mode, SHIFT_IN_STR is written via fromUWriteUInt8().
 *
 * @param args  conversion arguments; source and target are advanced
 * @param err   in/out error code
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546 pass the arguments to
     * MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): this repeats the assignment made three lines above; nothing changes the value in between */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* a code point is pending from an earlier call: resume at the surrogate handling */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only DBCS or SBCS characters are expected*/
            /* DB characters with high bit set to 1 are expected */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                /* result is outside the accepted ranges: treat it as not convertible */
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                  /* append the shift sequence */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    /* there is room for this byte: target < targetLimit was checked at the top of this iteration */
                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write the targetUniChar  to target */
                if(targetByteUnit <= 0x00FF){
                    /* single byte */
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        /* no room left: keep the byte in the overflow buffer */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double byte: each byte is written with 0x80 subtracted */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            /* only the second byte overflows */
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        /* both bytes overflow */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* oops.. the code point is unassigned
                 * set the error and reason
                 */

                /*check if the char is a First surrogate*/
                if(UTF_IS_SURROGATE(sourceChar)) {
                    if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
                        /*look ahead to find the trail surrogate*/
                        if(source <  sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(UTF_IS_SECOND_SURROGATE(trail)) {
                                source++;
                                sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                                *err = U_INVALID_CHAR_FOUND;
                                /* convert this surrogate code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                /* remember the code point in the converter and stop converting */
                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(myTargetIndex<myTargetLength) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(mySourceIndex<mySourceLength) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more if the last unit is a trail surrogate that is first in the buffer or follows a lead surrogate */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        /* write the closing shift-in byte with that source index */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}
2538
2539/************************ To Unicode ***************************************/
2540
2541static void
2542UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2543                                                            UErrorCode* err){
2544    char const* sourceStart;
2545    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2546
2547    UConverterToUnicodeArgs subArgs;
2548    int32_t minArgsSize;
2549
2550    /* set up the subconverter arguments */
2551    if(args->size<sizeof(UConverterToUnicodeArgs)) {
2552        minArgsSize = args->size;
2553    } else {
2554        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2555    }
2556
2557    uprv_memcpy(&subArgs, args, minArgsSize);
2558    subArgs.size = (uint16_t)minArgsSize;
2559    subArgs.converter = myData->currentConverter;
2560
2561    /* remember the original start of the input for offsets */
2562    sourceStart = args->source;
2563
2564    if(myData->key != 0) {
2565        /* continue with a partial escape sequence */
2566        goto escape;
2567    }
2568
2569    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2570        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2571        subArgs.source = args->source;
2572        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2573        if(subArgs.source != subArgs.sourceLimit) {
2574            /*
2575             * get the current partial byte sequence
2576             *
2577             * it needs to be moved between the public and the subconverter
2578             * so that the conversion framework, which only sees the public
2579             * converter, can handle truncated and illegal input etc.
2580             */
2581            if(args->converter->toULength > 0) {
2582                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2583            }
2584            subArgs.converter->toULength = args->converter->toULength;
2585
2586            /*
2587             * Convert up to the end of the input, or to before the next escape character.
2588             * Does not handle conversion extensions because the preToU[] state etc.
2589             * is not copied.
2590             */
2591            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2592
2593            if(args->offsets != NULL && sourceStart != args->source) {
2594                /* update offsets to base them on the actual start of the input */
2595                int32_t *offsets = args->offsets;
2596                UChar *target = args->target;
2597                int32_t delta = (int32_t)(args->source - sourceStart);
2598                while(target < subArgs.target) {
2599                    if(*offsets >= 0) {
2600                        *offsets += delta;
2601                    }
2602                    ++offsets;
2603                    ++target;
2604                }
2605            }
2606            args->source = subArgs.source;
2607            args->target = subArgs.target;
2608            args->offsets = subArgs.offsets;
2609
2610            /* copy input/error/overflow buffers */
2611            if(subArgs.converter->toULength > 0) {
2612                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2613            }
2614            args->converter->toULength = subArgs.converter->toULength;
2615
2616            if(*err == U_BUFFER_OVERFLOW_ERROR) {
2617                if(subArgs.converter->UCharErrorBufferLength > 0) {
2618                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2619                                subArgs.converter->UCharErrorBufferLength);
2620                }
2621                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2622                subArgs.converter->UCharErrorBufferLength = 0;
2623            }
2624        }
2625
2626        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2627            return;
2628        }
2629
2630escape:
2631        changeState_2022(args->converter,
2632               &(args->source),
2633               args->sourceLimit,
2634               ISO_2022_KR,
2635               err);
2636    }
2637}
2638
2639static void
2640UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2641                                                            UErrorCode* err){
2642    char tempBuf[2];
2643    const char *mySource = ( char *) args->source;
2644    UChar *myTarget = args->target;
2645    const char *mySourceLimit = args->sourceLimit;
2646    UChar32 targetUniChar = 0x0000;
2647    UChar mySourceChar = 0x0000;
2648    UConverterDataISO2022* myData;
2649    UConverterSharedData* sharedData ;
2650    UBool useFallback;
2651
2652    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2653    if(myData->version==1){
2654        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2655        return;
2656    }
2657
2658    /* initialize state */
2659    sharedData = myData->currentConverter->sharedData;
2660    useFallback = args->converter->useFallback;
2661
2662    if(myData->key != 0) {
2663        /* continue with a partial escape sequence */
2664        goto escape;
2665    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2666        /* continue with a partial double-byte character */
2667        mySourceChar = args->converter->toUBytes[0];
2668        args->converter->toULength = 0;
2669        goto getTrailByte;
2670    }
2671
2672    while(mySource< mySourceLimit){
2673
2674        if(myTarget < args->targetLimit){
2675
2676            mySourceChar= (unsigned char) *mySource++;
2677
2678            if(mySourceChar==UCNV_SI){
2679                myData->toU2022State.g = 0;
2680                if (myData->isEmptySegment) {
2681                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
2682                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2683                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
2684                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2685                    args->converter->toULength = 1;
2686                    args->target = myTarget;
2687                    args->source = mySource;
2688                    return;
2689                }
2690                /*consume the source */
2691                continue;
2692            }else if(mySourceChar==UCNV_SO){
2693                myData->toU2022State.g = 1;
2694                myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
2695                /*consume the source */
2696                continue;
2697            }else if(mySourceChar==ESC_2022){
2698                mySource--;
2699escape:
2700                myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */
2701                changeState_2022(args->converter,&(mySource),
2702                                mySourceLimit, ISO_2022_KR, err);
2703                if(U_FAILURE(*err)){
2704                    args->target = myTarget;
2705                    args->source = mySource;
2706                    return;
2707                }
2708                continue;
2709            }
2710
2711            myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
2712            if(myData->toU2022State.g == 1) {
2713                if(mySource < mySourceLimit) {
2714                    int leadIsOk, trailIsOk;
2715                    uint8_t trailByte;
2716getTrailByte:
2717                    targetUniChar = missingCharMarker;
2718                    trailByte = (uint8_t)*mySource;
2719                    /*
2720                     * Ticket 5691: consistent illegal sequences:
2721                     * - We include at least the first byte in the illegal sequence.
2722                     * - If any of the non-initial bytes could be the start of a character,
2723                     *   we stop the illegal sequence before the first one of those.
2724                     *
2725                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2726                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2727                     * Otherwise we convert or report the pair of bytes.
2728                     */
2729                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2730                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2731                    if (leadIsOk && trailIsOk) {
2732                        ++mySource;
2733                        tempBuf[0] = (char)(mySourceChar + 0x80);
2734                        tempBuf[1] = (char)(trailByte + 0x80);
2735                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2736                        mySourceChar = (mySourceChar << 8) | trailByte;
2737                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2738                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2739                        ++mySource;
2740                        /* add another bit so that the code below writes 2 bytes in case of error */
2741                        mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2742                    }
2743                } else {
2744                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2745                    args->converter->toULength = 1;
2746                    break;
2747                }
2748            }
2749            else if(mySourceChar <= 0x7f) {
2750                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2751            } else {
2752                targetUniChar = 0xffff;
2753            }
2754            if(targetUniChar < 0xfffe){
2755                if(args->offsets) {
2756                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2757                }
2758                *(myTarget++)=(UChar)targetUniChar;
2759            }
2760            else {
2761                /* Call the callback function*/
2762                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2763                break;
2764            }
2765        }
2766        else{
2767            *err =U_BUFFER_OVERFLOW_ERROR;
2768            break;
2769        }
2770    }
2771    args->target = myTarget;
2772    args->source = mySource;
2773}
2774
2775/*************************** END ISO2022-KR *********************************/
2776
2777/*************************** ISO-2022-CN *********************************
2778*
2779* Rules for ISO-2022-CN Encoding:
2780* i)   The designator sequence must appear once on a line before any instance
2781*      of character set it designates.
2782* ii)  If two lines contain characters from the same character set, both lines
2783*      must include the designator sequence.
2784* iii) Once the designator sequence is known, a shifting sequence has to be found
2785*      to invoke the  shifting
2786* iv)  All lines start in ASCII and end in ASCII.
2787* v)   Four shifting sequences are employed for this purpose:
2788*
*      Sequence    ASCII Eq    Charsets
2790*      ----------  -------    ---------
2791*      SI           <SI>        US-ASCII
2792*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2793*      SS2          <ESC>N      CNS-11643-1992 Plane 2
2794*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2795*
2796* vi)
2797*      SOdesignator  : ESC "$" ")" finalchar_for_SO
2798*      SS2designator : ESC "$" "*" finalchar_for_SS2
2799*      SS3designator : ESC "$" "+" finalchar_for_SS3
2800*
2801*      ESC $ ) A       Indicates the bytes following SO are Chinese
2802*       characters as defined in GB 2312-80, until
2803*       another SOdesignation appears
2804*
2805*
2806*      ESC $ ) E       Indicates the bytes following SO are as defined
2807*       in ISO-IR-165 (for details, see section 2.1),
2808*       until another SOdesignation appears
2809*
2810*      ESC $ ) G       Indicates the bytes following SO are as defined
2811*       in CNS 11643-plane-1, until another
2812*       SOdesignation appears
2813*
2814*      ESC $ * H       Indicates the two bytes immediately following
2815*       SS2 is a Chinese character as defined in CNS
2816*       11643-plane-2, until another SS2designation
2817*       appears
*       (Meaning <ESC>N must precede every 2 byte
2819*        sequence.)
2820*
2821*      ESC $ + I       Indicates the immediate two bytes following SS3
2822*       is a Chinese character as defined in CNS
2823*       11643-plane-3, until another SS3designation
2824*       appears
*       (Meaning <ESC>O must precede every 2 byte
2826*        sequence.)
2827*
2828*      ESC $ + J       Indicates the immediate two bytes following SS3
2829*       is a Chinese character as defined in CNS
2830*       11643-plane-4, until another SS3designation
2831*       appears
*       (In English: <ESC>O must precede every 2 byte
2833*        sequence.)
2834*
2835*      ESC $ + K       Indicates the immediate two bytes following SS3
2836*       is a Chinese character as defined in CNS
2837*       11643-plane-5, until another SS3designation
2838*       appears
2839*
2840*      ESC $ + L       Indicates the immediate two bytes following SS3
2841*       is a Chinese character as defined in CNS
2842*       11643-plane-6, until another SS3designation
2843*       appears
2844*
2845*      ESC $ + M       Indicates the immediate two bytes following SS3
2846*       is a Chinese character as defined in CNS
2847*       11643-plane-7, until another SS3designation
2848*       appears
2849*
2850*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2851*       has its own designation information before any Chinese characters
2852*       appear
2853*
2854*/
2855
/*
 * ISO-2022-CN designator escape sequences, spelled out as raw bytes.
 * Every sequence is ESC (0x1B) '$' (0x24), then ')' (0x29), '*' (0x2A) or
 * '+' (0x2B), then one final byte; the ASCII rendering of each sequence is
 * given next to its definition.
 *
 * The following are defined this way to make the strings truly readonly
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";              /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";              /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";  /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";  /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";  /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";  /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";  /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";  /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";  /* ESC $ + M */
2866
2867/********************** ISO2022-CN Data **************************/
/*
 * Table of the ten ISO-2022-CN sequences: SHIFT_IN_STR for ASCII first,
 * followed by the nine designator sequences in the order listed below.
 * NOTE(review): presumably indexed by the ISO-2022-CN charset/state
 * enumeration used by the converter - confirm against the code that reads
 * this table.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,           /* ASCII */
        GB_2312_80_STR,             /* GB 2312-80 */
        ISO_IR_165_STR,             /* ISO-IR-165 */
        CNS_11643_1992_Plane_1_STR, /* CNS 11643-1992 plane 1 */
        CNS_11643_1992_Plane_2_STR, /* CNS 11643-1992 plane 2 */
        CNS_11643_1992_Plane_3_STR, /* CNS 11643-1992 plane 3 */
        CNS_11643_1992_Plane_4_STR, /* CNS 11643-1992 plane 4 */
        CNS_11643_1992_Plane_5_STR, /* CNS 11643-1992 plane 5 */
        CNS_11643_1992_Plane_6_STR, /* CNS 11643-1992 plane 6 */
        CNS_11643_1992_Plane_7_STR  /* CNS 11643-1992 plane 7 */
};
2880
2881static void
2882UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2883    UConverter *cnv = args->converter;
2884    UConverterDataISO2022 *converterData;
2885    ISO2022State *pFromU2022State;
2886    uint8_t *target = (uint8_t *) args->target;
2887    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2888    const UChar* source = args->source;
2889    const UChar* sourceLimit = args->sourceLimit;
2890    int32_t* offsets = args->offsets;
2891    UChar32 sourceChar;
2892    char buffer[8];
2893    int32_t len;
2894    int8_t choices[3];
2895    int32_t choiceCount;
2896    uint32_t targetValue = 0;
2897    UBool useFallback;
2898
2899    /* set up the state */
2900    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2901    pFromU2022State   = &converterData->fromU2022State;
2902
2903    choiceCount = 0;
2904
2905    /* check if the last codepoint of previous buffer was a lead surrogate*/
2906    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2907        goto getTrail;
2908    }
2909
2910    while( source < sourceLimit){
2911        if(target < targetLimit){
2912
2913            sourceChar  = *(source++);
2914            /*check if the char is a First surrogate*/
2915             if(UTF_IS_SURROGATE(sourceChar)) {
2916                if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2917getTrail:
2918                    /*look ahead to find the trail surrogate*/
2919                    if(source < sourceLimit) {
2920                        /* test the following code unit */
2921                        UChar trail=(UChar) *source;
2922                        if(UTF_IS_SECOND_SURROGATE(trail)) {
2923                            source++;
2924                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2925                            cnv->fromUChar32=0x00;
2926                            /* convert this supplementary code point */
2927                            /* exit this condition tree */
2928                        } else {
2929                            /* this is an unmatched lead code unit (1st surrogate) */
2930                            /* callback(illegal) */
2931                            *err=U_ILLEGAL_CHAR_FOUND;
2932                            cnv->fromUChar32=sourceChar;
2933                            break;
2934                        }
2935                    } else {
2936                        /* no more input */
2937                        cnv->fromUChar32=sourceChar;
2938                        break;
2939                    }
2940                } else {
2941                    /* this is an unmatched trail code unit (2nd surrogate) */
2942                    /* callback(illegal) */
2943                    *err=U_ILLEGAL_CHAR_FOUND;
2944                    cnv->fromUChar32=sourceChar;
2945                    break;
2946                }
2947            }
2948
2949            /* do the conversion */
2950            if(sourceChar <= 0x007f ){
2951                /* do not convert SO/SI/ESC */
2952                if(IS_2022_CONTROL(sourceChar)) {
2953                    /* callback(illegal) */
2954                    *err=U_ILLEGAL_CHAR_FOUND;
2955                    cnv->fromUChar32=sourceChar;
2956                    break;
2957                }
2958
2959                /* US-ASCII */
2960                if(pFromU2022State->g == 0) {
2961                    buffer[0] = (char)sourceChar;
2962                    len = 1;
2963                } else {
2964                    buffer[0] = UCNV_SI;
2965                    buffer[1] = (char)sourceChar;
2966                    len = 2;
2967                    pFromU2022State->g = 0;
2968                    choiceCount = 0;
2969                }
2970                if(sourceChar == CR || sourceChar == LF) {
2971                    /* reset the state at the end of a line */
2972                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2973                    choiceCount = 0;
2974                }
2975            }
2976            else{
2977                /* convert U+0080..U+10ffff */
2978                int32_t i;
2979                int8_t cs, g;
2980
2981                if(choiceCount == 0) {
2982                    /* try the current SO/G1 converter first */
2983                    choices[0] = pFromU2022State->cs[1];
2984
2985                    /* default to GB2312_1 if none is designated yet */
2986                    if(choices[0] == 0) {
2987                        choices[0] = GB2312_1;
2988                    }
2989
2990                    if(converterData->version == 0) {
2991                        /* ISO-2022-CN */
2992
2993                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2994                        if(choices[0] == GB2312_1) {
2995                            choices[1] = (int8_t)CNS_11643_1;
2996                        } else {
2997                            choices[1] = (int8_t)GB2312_1;
2998                        }
2999
3000                        choiceCount = 2;
3001                    } else if (converterData->version == 1) {
3002                        /* ISO-2022-CN-EXT */
3003
3004                        /* try one of the other converters */
3005                        switch(choices[0]) {
3006                        case GB2312_1:
3007                            choices[1] = (int8_t)CNS_11643_1;
3008                            choices[2] = (int8_t)ISO_IR_165;
3009                            break;
3010                        case ISO_IR_165:
3011                            choices[1] = (int8_t)GB2312_1;
3012                            choices[2] = (int8_t)CNS_11643_1;
3013                            break;
3014                        default: /* CNS_11643_x */
3015                            choices[1] = (int8_t)GB2312_1;
3016                            choices[2] = (int8_t)ISO_IR_165;
3017                            break;
3018                        }
3019
3020                        choiceCount = 3;
3021                    } else {
3022                        choices[0] = (int8_t)CNS_11643_1;
3023                        choices[1] = (int8_t)GB2312_1;
3024                    }
3025                }
3026
3027                cs = g = 0;
3028                /*
3029                 * len==0: no mapping found yet
3030                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3031                 * len>0: found a roundtrip result, done
3032                 */
3033                len = 0;
3034                /*
3035                 * We will turn off useFallback after finding a fallback,
3036                 * but we still get fallbacks from PUA code points as usual.
3037                 * Therefore, we will also need to check that we don't overwrite
3038                 * an early fallback with a later one.
3039                 */
3040                useFallback = cnv->useFallback;
3041
3042                for(i = 0; i < choiceCount && len <= 0; ++i) {
3043                    int8_t cs0 = choices[i];
3044                    if(cs0 > 0) {
3045                        uint32_t value;
3046                        int32_t len2;
3047                        if(cs0 >= CNS_11643_0) {
3048                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3049                                        converterData->myConverterArray[CNS_11643],
3050                                        sourceChar,
3051                                        &value,
3052                                        useFallback,
3053                                        MBCS_OUTPUT_3);
3054                            if(len2 == 3 || (len2 == -3 && len == 0)) {
3055                                targetValue = value;
3056                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3057                                if(len2 >= 0) {
3058                                    len = 2;
3059                                } else {
3060                                    len = -2;
3061                                    useFallback = FALSE;
3062                                }
3063                                if(cs == CNS_11643_1) {
3064                                    g = 1;
3065                                } else if(cs == CNS_11643_2) {
3066                                    g = 2;
3067                                } else /* plane 3..7 */ if(converterData->version == 1) {
3068                                    g = 3;
3069                                } else {
3070                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3071                                    len = 0;
3072                                }
3073                            }
3074                        } else {
3075                            /* GB2312_1 or ISO-IR-165 */
3076                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3077                                        converterData->myConverterArray[cs0],
3078                                        sourceChar,
3079                                        &value,
3080                                        useFallback,
3081                                        MBCS_OUTPUT_2);
3082                            if(len2 == 2 || (len2 == -2 && len == 0)) {
3083                                targetValue = value;
3084                                len = len2;
3085                                cs = cs0;
3086                                g = 1;
3087                                useFallback = FALSE;
3088                            }
3089                        }
3090                    }
3091                }
3092
3093                if(len != 0) {
3094                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
3095
3096                    /* write the designation sequence if necessary */
3097                    if(cs != pFromU2022State->cs[g]) {
3098                        if(cs < CNS_11643) {
3099                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3100                        } else {
3101                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3102                        }
3103                        len = 4;
3104                        pFromU2022State->cs[g] = cs;
3105                        if(g == 1) {
3106                            /* changing the SO/G1 charset invalidates the choices[] */
3107                            choiceCount = 0;
3108                        }
3109                    }
3110
3111                    /* write the shift sequence if necessary */
3112                    if(g != pFromU2022State->g) {
3113                        switch(g) {
3114                        case 1:
3115                            buffer[len++] = UCNV_SO;
3116
3117                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3118                            pFromU2022State->g = 1;
3119                            break;
3120                        case 2:
3121                            buffer[len++] = 0x1b;
3122                            buffer[len++] = 0x4e;
3123                            break;
3124                        default: /* case 3 */
3125                            buffer[len++] = 0x1b;
3126                            buffer[len++] = 0x4f;
3127                            break;
3128                        }
3129                    }
3130
3131                    /* write the two output bytes */
3132                    buffer[len++] = (char)(targetValue >> 8);
3133                    buffer[len++] = (char)targetValue;
3134                } else {
3135                    /* if we cannot find the character after checking all codepages
3136                     * then this is an error
3137                     */
3138                    *err = U_INVALID_CHAR_FOUND;
3139                    cnv->fromUChar32=sourceChar;
3140                    break;
3141                }
3142            }
3143
3144            /* output len>0 bytes in buffer[] */
3145            if(len == 1) {
3146                *target++ = buffer[0];
3147                if(offsets) {
3148                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3149                }
3150            } else if(len == 2 && (target + 2) <= targetLimit) {
3151                *target++ = buffer[0];
3152                *target++ = buffer[1];
3153                if(offsets) {
3154                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3155                    *offsets++ = sourceIndex;
3156                    *offsets++ = sourceIndex;
3157                }
3158            } else {
3159                fromUWriteUInt8(
3160                    cnv,
3161                    buffer, len,
3162                    &target, (const char *)targetLimit,
3163                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3164                    err);
3165                if(U_FAILURE(*err)) {
3166                    break;
3167                }
3168            }
3169        } /* end if(myTargetIndex<myTargetLength) */
3170        else{
3171            *err =U_BUFFER_OVERFLOW_ERROR;
3172            break;
3173        }
3174
3175    }/* end while(mySourceIndex<mySourceLength) */
3176
3177    /*
3178     * the end of the input stream and detection of truncated input
3179     * are handled by the framework, but for ISO-2022-CN conversion
3180     * we need to be in ASCII mode at the very end
3181     *
3182     * conditions:
3183     *   successful
3184     *   not in ASCII mode
3185     *   end of input and no truncated input
3186     */
3187    if( U_SUCCESS(*err) &&
3188        pFromU2022State->g!=0 &&
3189        args->flush && source>=sourceLimit && cnv->fromUChar32==0
3190    ) {
3191        int32_t sourceIndex;
3192
3193        /* we are switching to ASCII */
3194        pFromU2022State->g=0;
3195
3196        /* get the source index of the last input character */
3197        /*
3198         * TODO this would be simpler and more reliable if we used a pair
3199         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3200         * so that we could simply use the prevSourceIndex here;
3201         * this code gives an incorrect result for the rare case of an unmatched
3202         * trail surrogate that is alone in the last buffer of the text stream
3203         */
3204        sourceIndex=(int32_t)(source-args->source);
3205        if(sourceIndex>0) {
3206            --sourceIndex;
3207            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3208                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3209            ) {
3210                --sourceIndex;
3211            }
3212        } else {
3213            sourceIndex=-1;
3214        }
3215
3216        fromUWriteUInt8(
3217            cnv,
3218            SHIFT_IN_STR, 1,
3219            &target, (const char *)targetLimit,
3220            &offsets, sourceIndex,
3221            err);
3222    }
3223
3224    /*save the state and return */
3225    args->source = source;
3226    args->target = (char*)target;
3227}
3228
3229
3230static void
3231UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3232                                               UErrorCode* err){
3233    char tempBuf[3];
3234    const char *mySource = (char *) args->source;
3235    UChar *myTarget = args->target;
3236    const char *mySourceLimit = args->sourceLimit;
3237    uint32_t targetUniChar = 0x0000;
3238    uint32_t mySourceChar = 0x0000;
3239    UConverterDataISO2022* myData;
3240    ISO2022State *pToU2022State;
3241
3242    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3243    pToU2022State = &myData->toU2022State;
3244
3245    if(myData->key != 0) {
3246        /* continue with a partial escape sequence */
3247        goto escape;
3248    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3249        /* continue with a partial double-byte character */
3250        mySourceChar = args->converter->toUBytes[0];
3251        args->converter->toULength = 0;
3252        targetUniChar = missingCharMarker;
3253        goto getTrailByte;
3254    }
3255
3256    while(mySource < mySourceLimit){
3257
3258        targetUniChar =missingCharMarker;
3259
3260        if(myTarget < args->targetLimit){
3261
3262            mySourceChar= (unsigned char) *mySource++;
3263
3264            switch(mySourceChar){
3265            case UCNV_SI:
3266                pToU2022State->g=0;
3267                if (myData->isEmptySegment) {
3268                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
3269                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3270                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
3271                    args->converter->toUBytes[0] = mySourceChar;
3272                    args->converter->toULength = 1;
3273                    args->target = myTarget;
3274                    args->source = mySource;
3275                    return;
3276                }
3277                continue;
3278
3279            case UCNV_SO:
3280                if(pToU2022State->cs[1] != 0) {
3281                    pToU2022State->g=1;
3282                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
3283                    continue;
3284                } else {
3285                    /* illegal to have SO before a matching designator */
3286                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
3287                    break;
3288                }
3289
3290            case ESC_2022:
3291                mySource--;
3292escape:
3293                {
3294                    const char * mySourceBefore = mySource;
3295                    int8_t toULengthBefore = args->converter->toULength;
3296
3297                    changeState_2022(args->converter,&(mySource),
3298                        mySourceLimit, ISO_2022_CN,err);
3299
3300                    /* After SO there must be at least one character before a designator (designator error handled separately) */
3301                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3302                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3303                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
3304                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3305                    }
3306                }
3307
3308                /* invalid or illegal escape sequence */
3309                if(U_FAILURE(*err)){
3310                    args->target = myTarget;
3311                    args->source = mySource;
3312                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
3313                    return;
3314                }
3315                continue;
3316
3317            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3318
3319            case CR:
3320                /*falls through*/
3321            case LF:
3322                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3323                /* falls through */
3324            default:
3325                /* convert one or two bytes */
3326                myData->isEmptySegment = FALSE;
3327                if(pToU2022State->g != 0) {
3328                    if(mySource < mySourceLimit) {
3329                        UConverterSharedData *cnv;
3330                        StateEnum tempState;
3331                        int32_t tempBufLen;
3332                        int leadIsOk, trailIsOk;
3333                        uint8_t trailByte;
3334getTrailByte:
3335                        trailByte = (uint8_t)*mySource;
3336                        /*
3337                         * Ticket 5691: consistent illegal sequences:
3338                         * - We include at least the first byte in the illegal sequence.
3339                         * - If any of the non-initial bytes could be the start of a character,
3340                         *   we stop the illegal sequence before the first one of those.
3341                         *
3342                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3343                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3344                         * Otherwise we convert or report the pair of bytes.
3345                         */
3346                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3347                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3348                        if (leadIsOk && trailIsOk) {
3349                            ++mySource;
3350                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3351                            if(tempState >= CNS_11643_0) {
3352                                cnv = myData->myConverterArray[CNS_11643];
3353                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3354                                tempBuf[1] = (char) (mySourceChar);
3355                                tempBuf[2] = (char) trailByte;
3356                                tempBufLen = 3;
3357
3358                            }else{
3359                                cnv = myData->myConverterArray[tempState];
3360                                tempBuf[0] = (char) (mySourceChar);
3361                                tempBuf[1] = (char) trailByte;
3362                                tempBufLen = 2;
3363                            }
3364                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3365                            mySourceChar = (mySourceChar << 8) | trailByte;
3366                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3367                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3368                            ++mySource;
3369                            /* add another bit so that the code below writes 2 bytes in case of error */
3370                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3371                        }
3372                        if(pToU2022State->g>=2) {
3373                            /* return from a single-shift state to the previous one */
3374                            pToU2022State->g=pToU2022State->prevG;
3375                        }
3376                    } else {
3377                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3378                        args->converter->toULength = 1;
3379                        goto endloop;
3380                    }
3381                }
3382                else{
3383                    if(mySourceChar <= 0x7f) {
3384                        targetUniChar = (UChar) mySourceChar;
3385                    }
3386                }
3387                break;
3388            }
3389            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3390                if(args->offsets){
3391                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3392                }
3393                *(myTarget++)=(UChar)targetUniChar;
3394            }
3395            else if(targetUniChar > missingCharMarker){
3396                /* disassemble the surrogate pair and write to output*/
3397                targetUniChar-=0x0010000;
3398                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3399                if(args->offsets){
3400                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3401                }
3402                ++myTarget;
3403                if(myTarget< args->targetLimit){
3404                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3405                    if(args->offsets){
3406                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3407                    }
3408                    ++myTarget;
3409                }else{
3410                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3411                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3412                }
3413
3414            }
3415            else{
3416                /* Call the callback function*/
3417                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3418                break;
3419            }
3420        }
3421        else{
3422            *err =U_BUFFER_OVERFLOW_ERROR;
3423            break;
3424        }
3425    }
3426endloop:
3427    args->target = myTarget;
3428    args->source = mySource;
3429}
3430
3431static void
3432_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3433    UConverter *cnv = args->converter;
3434    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3435    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3436    char *p, *subchar;
3437    char buffer[8];
3438    int32_t length;
3439
3440    subchar=(char *)cnv->subChars;
3441    length=cnv->subCharLen; /* assume length==1 for most variants */
3442
3443    p = buffer;
3444    switch(myConverterData->locale[0]){
3445    case 'j':
3446        {
3447            int8_t cs;
3448
3449            if(pFromU2022State->g == 1) {
3450                /* JIS7: switch from G1 to G0 */
3451                pFromU2022State->g = 0;
3452                *p++ = UCNV_SI;
3453            }
3454
3455            cs = pFromU2022State->cs[0];
3456            if(cs != ASCII && cs != JISX201) {
3457                /* not in ASCII or JIS X 0201: switch to ASCII */
3458                pFromU2022State->cs[0] = (int8_t)ASCII;
3459                *p++ = '\x1b';
3460                *p++ = '\x28';
3461                *p++ = '\x42';
3462            }
3463
3464            *p++ = subchar[0];
3465            break;
3466        }
3467    case 'c':
3468        if(pFromU2022State->g != 0) {
3469            /* not in ASCII mode: switch to ASCII */
3470            pFromU2022State->g = 0;
3471            *p++ = UCNV_SI;
3472        }
3473        *p++ = subchar[0];
3474        break;
3475    case 'k':
3476        if(myConverterData->version == 0) {
3477            if(length == 1) {
3478                if((UBool)args->converter->fromUnicodeStatus) {
3479                    /* in DBCS mode: switch to SBCS */
3480                    args->converter->fromUnicodeStatus = 0;
3481                    *p++ = UCNV_SI;
3482                }
3483                *p++ = subchar[0];
3484            } else /* length == 2*/ {
3485                if(!(UBool)args->converter->fromUnicodeStatus) {
3486                    /* in SBCS mode: switch to DBCS */
3487                    args->converter->fromUnicodeStatus = 1;
3488                    *p++ = UCNV_SO;
3489                }
3490                *p++ = subchar[0];
3491                *p++ = subchar[1];
3492            }
3493            break;
3494        } else {
3495            /* save the subconverter's substitution string */
3496            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3497            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3498
3499            /* set our substitution string into the subconverter */
3500            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3501            myConverterData->currentConverter->subCharLen = (int8_t)length;
3502
3503            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3504            args->converter = myConverterData->currentConverter;
3505            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3506            ucnv_cbFromUWriteSub(args, 0, err);
3507            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3508            args->converter = cnv;
3509
3510            /* restore the subconverter's substitution string */
3511            myConverterData->currentConverter->subChars = currentSubChars;
3512            myConverterData->currentConverter->subCharLen = currentSubCharLen;
3513
3514            if(*err == U_BUFFER_OVERFLOW_ERROR) {
3515                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3516                    uprv_memcpy(
3517                        cnv->charErrorBuffer,
3518                        myConverterData->currentConverter->charErrorBuffer,
3519                        myConverterData->currentConverter->charErrorBufferLength);
3520                }
3521                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3522                myConverterData->currentConverter->charErrorBufferLength = 0;
3523            }
3524            return;
3525        }
3526    default:
3527        /* not expected */
3528        break;
3529    }
3530    ucnv_cbFromUWriteBytes(args,
3531                           buffer, (int32_t)(p - buffer),
3532                           offsetIndex, err);
3533}
3534
3535/*
3536 * Structure for cloning an ISO 2022 converter into a single memory block.
3537 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3538 * and then ucnv_safeClone() of the sub-converter may additionally align
3539 * currentConverter inside the cloneStruct, for which we need the deadSpace
3540 * after currentConverter.
3541 * This is because UAlignedMemory may be larger than the actually
3542 * necessary alignment size for the platform.
3543 * The other cloneStruct fields will not be moved around,
3544 * and are aligned properly with cloneStruct's alignment.
3545 */
3546struct cloneStruct
3547{
3548    UConverter cnv;
3549    UConverter currentConverter;
3550    UAlignedMemory deadSpace;
3551    UConverterDataISO2022 mydata;
3552};
3553
3554
3555static UConverter *
3556_ISO_2022_SafeClone(
3557            const UConverter *cnv,
3558            void *stackBuffer,
3559            int32_t *pBufferSize,
3560            UErrorCode *status)
3561{
3562    struct cloneStruct * localClone;
3563    UConverterDataISO2022 *cnvData;
3564    int32_t i, size;
3565
3566    if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3567        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3568        return NULL;
3569    }
3570
3571    cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3572    localClone = (struct cloneStruct *)stackBuffer;
3573
3574    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3575
3576    uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3577    localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3578    localClone->cnv.isExtraLocal = TRUE;
3579
3580    /* share the subconverters */
3581
3582    if(cnvData->currentConverter != NULL) {
3583        size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3584        localClone->mydata.currentConverter =
3585            ucnv_safeClone(cnvData->currentConverter,
3586                            &localClone->currentConverter,
3587                            &size, status);
3588        if(U_FAILURE(*status)) {
3589            return NULL;
3590        }
3591    }
3592
3593    for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3594        if(cnvData->myConverterArray[i] != NULL) {
3595            ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3596        }
3597    }
3598
3599    return &localClone->cnv;
3600}
3601
3602static void
3603_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3604                    const USetAdder *sa,
3605                    UConverterUnicodeSet which,
3606                    UErrorCode *pErrorCode)
3607{
3608    int32_t i;
3609    UConverterDataISO2022* cnvData;
3610
3611    if (U_FAILURE(*pErrorCode)) {
3612        return;
3613    }
3614#ifdef U_ENABLE_GENERIC_ISO_2022
3615    if (cnv->sharedData == &_ISO2022Data) {
3616        /* We use UTF-8 in this case */
3617        sa->addRange(sa->set, 0, 0xd7FF);
3618        sa->addRange(sa->set, 0xE000, 0x10FFFF);
3619        return;
3620    }
3621#endif
3622
3623    cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3624
3625    /* open a set and initialize it with code points that are algorithmically round-tripped */
3626    switch(cnvData->locale[0]){
3627    case 'j':
3628        /* include JIS X 0201 which is hardcoded */
3629        sa->add(sa->set, 0xa5);
3630        sa->add(sa->set, 0x203e);
3631        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3632            /* include Latin-1 for some variants of JP */
3633            sa->addRange(sa->set, 0, 0xff);
3634        } else {
3635            /* include ASCII for JP */
3636            sa->addRange(sa->set, 0, 0x7f);
3637        }
3638        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3639            /*
3640             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3641             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3642             * use half-width Katakana.
3643             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3644             * half-width Katakana via the ESC ( I sequence.
3645             * However, we only emit (fromUnicode) half-width Katakana according to the
3646             * definition of each variant.
3647             *
3648             * When including fallbacks,
3649             * we need to include half-width Katakana Unicode code points for all JP variants because
3650             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3651             */
3652            /* include half-width Katakana for JP */
3653            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3654        }
3655        break;
3656    case 'c':
3657    case 'z':
3658        /* include ASCII for CN */
3659        sa->addRange(sa->set, 0, 0x7f);
3660        break;
3661    case 'k':
3662        /* there is only one converter for KR, and it is not in the myConverterArray[] */
3663        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3664                cnvData->currentConverter, sa, which, pErrorCode);
3665        /* the loop over myConverterArray[] will simply not find another converter */
3666        break;
3667    default:
3668        break;
3669    }
3670
3671#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3672            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3673                cnvData->version==0 && i==CNS_11643
3674            ) {
3675                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3676                ucnv_MBCSGetUnicodeSetForBytes(
3677                        cnvData->myConverterArray[i],
3678                        sa, UCNV_ROUNDTRIP_SET,
3679                        0, 0x81, 0x82,
3680                        pErrorCode);
3681            }
3682#endif
3683
3684    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3685        UConverterSetFilter filter;
3686        if(cnvData->myConverterArray[i]!=NULL) {
3687            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3688                cnvData->version==0 && i==CNS_11643
3689            ) {
3690                /*
3691                 * Version-specific for CN:
3692                 * CN version 0 does not map CNS planes 3..7 although
3693                 * they are all available in the CNS conversion table;
3694                 * CN version 1 (-EXT) does map them all.
3695                 * The two versions create different Unicode sets.
3696                 */
3697                filter=UCNV_SET_FILTER_2022_CN;
3698            } else if(cnvData->locale[0]=='j' && i==JISX208) {
3699                /*
3700                 * Only add code points that map to Shift-JIS codes
3701                 * corresponding to JIS X 0208.
3702                 */
3703                filter=UCNV_SET_FILTER_SJIS;
3704            } else if(i==KSC5601) {
3705                /*
3706                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3707                 * are broader than GR94.
3708                 */
3709                filter=UCNV_SET_FILTER_GR94DBCS;
3710            } else {
3711                filter=UCNV_SET_FILTER_NONE;
3712            }
3713            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3714        }
3715    }
3716
3717    /*
3718     * ISO 2022 converters must not convert SO/SI/ESC despite what
3719     * sub-converters do by themselves.
3720     * Remove these characters from the set.
3721     */
3722    sa->remove(sa->set, 0x0e);
3723    sa->remove(sa->set, 0x0f);
3724    sa->remove(sa->set, 0x1b);
3725
3726    /* ISO 2022 converters do not convert C1 controls either */
3727    sa->removeRange(sa->set, 0x80, 0x9f);
3728}
3729
3730static const UConverterImpl _ISO2022Impl={
3731    UCNV_ISO_2022,
3732
3733    NULL,
3734    NULL,
3735
3736    _ISO2022Open,
3737    _ISO2022Close,
3738    _ISO2022Reset,
3739
3740#ifdef U_ENABLE_GENERIC_ISO_2022
3741    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3742    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3743    ucnv_fromUnicode_UTF8,
3744    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3745#else
3746    NULL,
3747    NULL,
3748    NULL,
3749    NULL,
3750#endif
3751    NULL,
3752
3753    NULL,
3754    _ISO2022getName,
3755    _ISO_2022_WriteSub,
3756    _ISO_2022_SafeClone,
3757    _ISO_2022_GetUnicodeSet
3758};
3759static const UConverterStaticData _ISO2022StaticData={
3760    sizeof(UConverterStaticData),
3761    "ISO_2022",
3762    2022,
3763    UCNV_IBM,
3764    UCNV_ISO_2022,
3765    1,
3766    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3767    { 0x1a, 0, 0, 0 },
3768    1,
3769    FALSE,
3770    FALSE,
3771    0,
3772    0,
3773    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3774};
3775const UConverterSharedData _ISO2022Data={
3776    sizeof(UConverterSharedData),
3777    ~((uint32_t) 0),
3778    NULL,
3779    NULL,
3780    &_ISO2022StaticData,
3781    FALSE,
3782    &_ISO2022Impl,
3783    0
3784};
3785
3786/*************JP****************/
3787static const UConverterImpl _ISO2022JPImpl={
3788    UCNV_ISO_2022,
3789
3790    NULL,
3791    NULL,
3792
3793    _ISO2022Open,
3794    _ISO2022Close,
3795    _ISO2022Reset,
3796
3797    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3798    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3799    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3800    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3801    NULL,
3802
3803    NULL,
3804    _ISO2022getName,
3805    _ISO_2022_WriteSub,
3806    _ISO_2022_SafeClone,
3807    _ISO_2022_GetUnicodeSet
3808};
3809static const UConverterStaticData _ISO2022JPStaticData={
3810    sizeof(UConverterStaticData),
3811    "ISO_2022_JP",
3812    0,
3813    UCNV_IBM,
3814    UCNV_ISO_2022,
3815    1,
3816    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3817    { 0x1a, 0, 0, 0 },
3818    1,
3819    FALSE,
3820    FALSE,
3821    0,
3822    0,
3823    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3824};
3825static const UConverterSharedData _ISO2022JPData={
3826    sizeof(UConverterSharedData),
3827    ~((uint32_t) 0),
3828    NULL,
3829    NULL,
3830    &_ISO2022JPStaticData,
3831    FALSE,
3832    &_ISO2022JPImpl,
3833    0
3834};
3835
3836/************* KR ***************/
3837static const UConverterImpl _ISO2022KRImpl={
3838    UCNV_ISO_2022,
3839
3840    NULL,
3841    NULL,
3842
3843    _ISO2022Open,
3844    _ISO2022Close,
3845    _ISO2022Reset,
3846
3847    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3848    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3849    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3850    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3851    NULL,
3852
3853    NULL,
3854    _ISO2022getName,
3855    _ISO_2022_WriteSub,
3856    _ISO_2022_SafeClone,
3857    _ISO_2022_GetUnicodeSet
3858};
3859static const UConverterStaticData _ISO2022KRStaticData={
3860    sizeof(UConverterStaticData),
3861    "ISO_2022_KR",
3862    0,
3863    UCNV_IBM,
3864    UCNV_ISO_2022,
3865    1,
3866    3, /* max 3 bytes per UChar: SO+DBCS */
3867    { 0x1a, 0, 0, 0 },
3868    1,
3869    FALSE,
3870    FALSE,
3871    0,
3872    0,
3873    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3874};
3875static const UConverterSharedData _ISO2022KRData={
3876    sizeof(UConverterSharedData),
3877    ~((uint32_t) 0),
3878    NULL,
3879    NULL,
3880    &_ISO2022KRStaticData,
3881    FALSE,
3882    &_ISO2022KRImpl,
3883    0
3884};
3885
3886/*************** CN ***************/
3887static const UConverterImpl _ISO2022CNImpl={
3888
3889    UCNV_ISO_2022,
3890
3891    NULL,
3892    NULL,
3893
3894    _ISO2022Open,
3895    _ISO2022Close,
3896    _ISO2022Reset,
3897
3898    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3899    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3900    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3901    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3902    NULL,
3903
3904    NULL,
3905    _ISO2022getName,
3906    _ISO_2022_WriteSub,
3907    _ISO_2022_SafeClone,
3908    _ISO_2022_GetUnicodeSet
3909};
3910static const UConverterStaticData _ISO2022CNStaticData={
3911    sizeof(UConverterStaticData),
3912    "ISO_2022_CN",
3913    0,
3914    UCNV_IBM,
3915    UCNV_ISO_2022,
3916    1,
3917    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3918    { 0x1a, 0, 0, 0 },
3919    1,
3920    FALSE,
3921    FALSE,
3922    0,
3923    0,
3924    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3925};
3926static const UConverterSharedData _ISO2022CNData={
3927    sizeof(UConverterSharedData),
3928    ~((uint32_t) 0),
3929    NULL,
3930    NULL,
3931    &_ISO2022CNStaticData,
3932    FALSE,
3933    &_ISO2022CNImpl,
3934    0
3935};
3936
3937
3938
3939#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3940