1/*
2**********************************************************************
3*   Copyright (C) 2000-2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv2022.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2000feb03
12*   created by: Markus W. Scherer
13*
14*   Change history:
15*
16*   06/29/2000  helena  Major rewrite of the callback APIs.
17*   08/08/2000  Ram     Included support for ISO-2022-JP-2
18*                       Changed implementation of toUnicode
19*                       function
20*   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
22*                       ucnvebdc.c
23*   09/20/2000  Ram     Added support for ISO-2022-CN
24*                       Added implementations for getNextUChar()
25*                       for specific 2022 country variants.
26*   10/31/2000  Ram     Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
37#include "unicode/utf16.h"
38#include "ucnv_imp.h"
39#include "ucnv_bld.h"
40#include "ucnv_cnv.h"
41#include "ucnvmbcs.h"
42#include "cstring.h"
43#include "cmemory.h"
44#include "uassert.h"
45
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression wherever it is used (e.g. as the operand of sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
47
48#ifdef U_ENABLE_GENERIC_ISO_2022
49/*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 *    its designation sequences, single shifts with return to the previous state,
56 *    switch-with-no-return to UTF-16BE or similar, etc.
57 *    This is unlike the language-specific variants like ISO-2022-JP which
58 *    require a much smaller repertoire of ISO-2022 features.
59 *    These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 *    but rather always one of the language-specific variants.
62 *    Note that ICU's generic ISO-2022 converter has always output one escape
63 *    sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 *    the previous converter is closed and a new one opened,
66 *    without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 *    reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 *    This means, for example, that when ISO-8859-7 is designated, the following
71 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 *    The ICU ISO-2022 converter does not handle this - and has no information
73 *    about which subconverter would have to be shifted vs. which is designed
74 *    for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78#endif
79
/* Shift In (SI) control byte as a NUL-terminated string. */
static const char SHIFT_IN_STR[] = "\x0F";
/* The Shift Out (SO) counterpart is kept for reference but is disabled: */
// static const char SHIFT_OUT_STR[] = "\x0E";

/* ASCII control and whitespace byte values, in ascending order. */
#define H_TAB   0x09
#define LF      0x0A
#define V_TAB   0x0B
#define CR      0x0D
#define SPACE   0x20
88
/* First and last code point of the HWKANA range, U+FF61..U+FF9F. */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
93
94/*
95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
96 * as bytes 21..7E. (Subtract 0x80.)
97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
98 * as bytes 20..7F. (Subtract 0x80.)
99 * Do not encode C1 control codes with native bytes 80..9F
100 * as bytes 00..1F (C0 control codes).
101 */
/* Native (GR) byte ranges of 94-character and 96-character sets. */
enum {
    /* 94-character sets: A1..FE */
    GR94_START = 0xA1,
    GR94_END   = 0xFE,
    /* 96-character sets: A0..FF */
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
108
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The argument is converted to uint32_t before the range check so that a
 * negative value is rejected (it becomes a large unsigned number) instead of
 * being used as a negative shift count, which would be undefined behavior.
 * Note: (c) is evaluated twice; do not pass an expression with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(uint32_t)(c))&0x0800c000)!=0)
116
117/* for ISO-2022-JP and -CN implementations */
/*
 * Charset/state numbers for the ISO-2022-JP and -CN implementations.
 * The JP and CN groups deliberately reuse the same small values (1, 2, 3).
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE,                  /* 0x11 */

    /* ISO-2022-JP charsets */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,          /* Halfwidth Katakana 7 bit */

    /* ISO-2022-CN charsets */
    /* these first constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The per-plane values below are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1,                /* 0x21 */
    CNS_11643_2,                /* 0x22 */
    CNS_11643_3,                /* 0x23 */
    CNS_11643_4,                /* 0x24 */
    CNS_11643_5,                /* 0x25 */
    CNS_11643_6,                /* 0x26 */
    CNS_11643_7                 /* 0x27 */
} StateEnum;
155
/* Is the StateEnum charset value cs one of the DBCS charsets JISX208..KSC5601? */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)

/* Charset mask: a uint16_t 1 shifted left by the charset number cs. */
#define CSM(cs) ((uint16_t)1 << (cs))
160
161/*
162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
164 *
165 * Note: The converter uses some leniency:
166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
167 *   all versions, not just JIS7 and JIS8.
168 * - ICU does not distinguish between different versions of JIS X 0208.
169 */
170enum { MAX_JA_VERSION=4 };
171static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
172    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
173    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
174    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
175    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
176    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
177};
178
/* Converter type tags; the values are consecutive, starting at 0. */
typedef enum {
    ASCII1 = 0,
    LATIN1,     /* 1 */
    SBCS,       /* 2 */
    DBCS,       /* 3 */
    MBCS,       /* 4 */
    HWKANA      /* 5 */
} Cnv2022Type;
187
/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
193
/* Mask for the version number in the low 4 bits of the converter options. */
#define UCNV_OPTIONS_VERSION_MASK   0xf
/* Capacity of UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS    10
196
197typedef struct{
198    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
199    UConverter *currentConverter;
200    Cnv2022Type currentType;
201    ISO2022State toU2022State, fromU2022State;
202    uint32_t key;
203    uint32_t version;
204#ifdef U_ENABLE_GENERIC_ISO_2022
205    UBool isFirstBuffer;
206#endif
207    UBool isEmptySegment;
208    char name[30];
209    char locale[3];
210}UConverterDataISO2022;
211
212/* Protos */
213/* ISO-2022 ----------------------------------------------------------------- */
214
215/*Forward declaration */
216U_CFUNC void
217ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
218                      UErrorCode * err);
219U_CFUNC void
220ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
221                                    UErrorCode * err);
222
/* The ESC byte that starts every escape sequence. */
#define ESC_2022 0x1B

/* Classification of an escape sequence (prefix) in escSeqStateTable_Value_2022[]. */
typedef enum {
    /* does not correspond to a valid iso 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
232
233/*
234* The way these state transition arrays work is:
235* ex : ESC$B is the sequence for JISX208
236*      a) First Iteration: char is ESC
237*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
238*             int x = normalize_esq_chars_2022[27] which is equal to 1
239*         ii) Search for this value in escSeqStateTable_Key_2022[]
240*             value of x is stored at escSeqStateTable_Key_2022[0]
241*        iii) Save this index as offset
242*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
243*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
244*     b) Switch on this state and continue to next char
245*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
246*             which is normalize_esq_chars_2022[36] == 4
247*         ii) x is currently 1(from above)
248*               x<<=5 -- x is now 32
249*               x+=normalize_esq_chars_2022[36]
250*               now x is 36
251*        iii) Search for this value in escSeqStateTable_Key_2022[]
252*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
253*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
254*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
255*     c) Switch on this state and continue to next char
256*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
257*        ii) x is currently 36 (from above)
258*            x<<=5 -- x is now 1152
259*            x+=normalize_esq_chars_2022[66]
260*            now x is 1161
261*       iii) Search for this value in escSeqStateTable_Key_2022[]
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
266*/
267
268
269/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value to the small code (1..29) that is combined into an escape
 * sequence key; every other byte maps to 0.
 * Only the rows for bytes 0x00..0x5F are spelled out; the remaining elements
 * (0x60..0xFF) are implicitly zero-initialized.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xA  xB  xC  xD  xE  xF */
/* 0x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 1x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
/* 2x */     0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
/* 3x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 4x */     5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
/* 5x */     0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0
};
300
301#ifdef U_ENABLE_GENERIC_ISO_2022
302/*
303 * When the generic ISO-2022 converter is completely removed, not just disabled
304 * per #ifdef, then the following state table and the associated tables that are
305 * dimensioned with MAX_STATES_2022 should be trimmed.
306 *
307 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
308 * the associated escape sequences starting with ESC ( B should be removed.
309 * This includes the ones with key values 1097 and all of the ones above 1000000.
310 *
311 * For the latter, the tables can simply be truncated.
312 * For the former, since the tables must be kept parallel, it is probably best
313 * to simply duplicate an adjacent table cell, parallel in all tables.
314 *
315 * It may make sense to restructure the tables, especially by using small search
316 * tables for the variants instead of indexing them parallel to the table here.
317 */
318#endif
319
/* Number of entries in each of the parallel escSeqStateTable_*_2022[] tables. */
#define MAX_STATES_2022 74

/*
 * Keys of the known escape sequences and their prefixes, in ascending order.
 * A key is accumulated from the normalize_esq_chars_2022[] values of the bytes
 * seen so far: key = (key<<5) + normalized byte, starting with ESC == 1.
 * The index of a key is also the index into the parallel tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0.. 7 */ 1,        34,       36,       39,       55,       57,       60,       61,
    /*  8..15 */ 1093,     1096,     1097,     1098,     1099,     1100,     1101,     1102,
    /* 16..23 */ 1103,     1104,     1105,     1106,     1109,     1154,     1157,     1160,
    /* 24..31 */ 1161,     1176,     1178,     1179,     1254,     1257,     1768,     1773,
    /* 32..39 */ 1957,     35105,    36933,    36936,    36937,    36938,    36939,    36940,
    /* 40..47 */ 36942,    36943,    36944,    36945,    36946,    36947,    36948,    37640,
    /* 48..55 */ 37642,    37644,    37646,    37711,    37744,    37745,    37746,    37747,
    /* 56..63 */ 37748,    40133,    40136,    40138,    40139,    40140,    40141,    1123363,
    /* 64..71 */ 35947624, 35947625, 35947626, 35947627, 35947629, 35947630, 35947631, 35947635,
    /* 72..73 */ 35947636, 35947638
};
333
334#ifdef U_ENABLE_GENERIC_ISO_2022
335
336static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
337 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
338
339     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
340    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
341    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
342    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
343    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
344    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
345    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
346    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
347};
348
349#endif
350
351static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
352/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
353     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
354    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
355    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
356    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
357    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
358    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
359    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
360    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
361};
362
363
/* The ISO-2022 variants; the generic one (0) exists only when it is enabled. */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
373
374/*********** ISO 2022 Converter Protos ***********/
375static void
376_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
377
378static void
379 _ISO2022Close(UConverter *converter);
380
381static void
382_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
383
384static const char*
385_ISO2022getName(const UConverter* cnv);
386
387static void
388_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
389
390static UConverter *
391_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
392
393#ifdef U_ENABLE_GENERIC_ISO_2022
394static void
395T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
396#endif
397
398namespace {
399
400/*const UConverterSharedData _ISO2022Data;*/
401extern const UConverterSharedData _ISO2022JPData;
402extern const UConverterSharedData _ISO2022KRData;
403extern const UConverterSharedData _ISO2022CNData;
404
405}  // namespace
406
407/*************** Converter implementations ******************/
408
409/* The purpose of this function is to get around gcc compiler warnings. */
410static inline void
411fromUWriteUInt8(UConverter *cnv,
412                 const char *bytes, int32_t length,
413                 uint8_t **target, const char *targetLimit,
414                 int32_t **offsets,
415                 int32_t sourceIndex,
416                 UErrorCode *pErrorCode)
417{
418    char *targetChars = (char *)*target;
419    ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
420                         offsets, sourceIndex, pErrorCode);
421    *target = (uint8_t*)targetChars;
422
423}
424
425static inline void
426setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
427    if(myConverterData->version == 1) {
428        UConverter *cnv = myConverterData->currentConverter;
429
430        cnv->toUnicodeStatus=0;     /* offset */
431        cnv->mode=0;                /* state */
432        cnv->toULength=0;           /* byteIndex */
433    }
434}
435
436static inline void
437setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
438   /* in ISO-2022-KR the designator sequence appears only once
439    * in a file so we append it only once
440    */
441    if( converter->charErrorBufferLength==0){
442
443        converter->charErrorBufferLength = 4;
444        converter->charErrorBuffer[0] = 0x1b;
445        converter->charErrorBuffer[1] = 0x24;
446        converter->charErrorBuffer[2] = 0x29;
447        converter->charErrorBuffer[3] = 0x43;
448    }
449    if(myConverterData->version == 1) {
450        UConverter *cnv = myConverterData->currentConverter;
451
452        cnv->fromUChar32=0;
453        cnv->fromUnicodeStatus=1;   /* prevLength */
454    }
455}
456
457static void
458_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
459
460    char myLocale[6]={' ',' ',' ',' ',' ',' '};
461
462    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
463    if(cnv->extraInfo != NULL) {
464        UConverterNamePieces stackPieces;
465        UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
466        UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
467        uint32_t version;
468
469        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
470
471        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
472        myConverterData->currentType = ASCII1;
473        cnv->fromUnicodeStatus =FALSE;
474        if(pArgs->locale){
475            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
476        }
477        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
478        myConverterData->version = version;
479        /* Begin Google-specific change. */
480        /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */
481        /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */
482        if((myLocale[0]=='j' &&
483            (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' ||
484             myLocale[1]=='s') &&
485            (myLocale[2]=='_' || myLocale[2]=='\0')))
486        {
487            size_t len=0;
488            /* open the required converters and cache them */
489            if(version>MAX_JA_VERSION) {
490                /* prevent indexing beyond jpCharsetMasks[] */
491                myConverterData->version = version = 0;
492            }
493            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
494                myConverterData->myConverterArray[ISO8859_7] =
495                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
496            }
497            if (myLocale[1]=='k') {  /* Use KDDI's version. */
498                myConverterData->myConverterArray[JISX208]  =
499                    ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
500            } else if (myLocale[1]=='s') {  /* Use SoftBank's version. */
501                myConverterData->myConverterArray[JISX208]  =
502                    ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
503            } else {
504                /*
505                 * Change for http://b/issue?id=937017 :
506                 * Restore JIS X 0208 ISO-2022-JP mappings from before
507                 * sharing the table with the Shift-JIS converter
508                 * (CL 5963009 and http://bugs.icu-project.org/trac/ticket/5797).
509                 * TODO(mscherer): Create and use a new, unified Google Shift-JIS
510                 * table for both Shift-JIS and ISO-2022-JP.
511                 */
512                myConverterData->myConverterArray[JISX208]  =
513                    ucnv_loadSharedData("jisx-208", &stackPieces, &stackArgs, errorCode);
514            }
515            /* End Google-specific change. */
516            if(jpCharsetMasks[version]&CSM(JISX212)) {
517                myConverterData->myConverterArray[JISX212] =
518                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
519            }
520            if(jpCharsetMasks[version]&CSM(GB2312)) {
521                myConverterData->myConverterArray[GB2312] =
522                    /* BEGIN android-changed */
523                    ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
524                    /* END android-changed */
525            }
526            if(jpCharsetMasks[version]&CSM(KSC5601)) {
527                myConverterData->myConverterArray[KSC5601] =
528                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
529            }
530
531            /* set the function pointers to appropriate funtions */
532            cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
533            uprv_strcpy(myConverterData->locale,"ja");
534
535            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
536            len = uprv_strlen(myConverterData->name);
537            myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
538            myConverterData->name[len+1]='\0';
539        }
540        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541            (myLocale[2]=='_' || myLocale[2]=='\0'))
542        {
543            const char *cnvName;
544            if(version==1) {
545                cnvName="icu-internal-25546";
546            } else {
547                /* BEGIN android-changed */
548                cnvName="ksc_5601";
549                /* END android-changed */
550                myConverterData->version=version=0;
551            }
552            if(pArgs->onlyTestIsLoadable) {
553                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
554                uprv_free(cnv->extraInfo);
555                cnv->extraInfo=NULL;
556                return;
557            } else {
558                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
559                if (U_FAILURE(*errorCode)) {
560                    _ISO2022Close(cnv);
561                    return;
562                }
563
564                if(version==1) {
565                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
566                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
567                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
568                }else{
569                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
570                }
571
572                /* initialize the state variables */
573                setInitialStateToUnicodeKR(cnv, myConverterData);
574                setInitialStateFromUnicodeKR(cnv, myConverterData);
575
576                /* set the function pointers to appropriate funtions */
577                cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
578                uprv_strcpy(myConverterData->locale,"ko");
579            }
580        }
581        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
582            (myLocale[2]=='_' || myLocale[2]=='\0'))
583        {
584
585            /* open the required converters and cache them */
586            /* BEGIN android-changed */
587            myConverterData->myConverterArray[GB2312_1] =
588                ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
589            if(version==1) {
590                myConverterData->myConverterArray[ISO_IR_165] =
591                    ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
592            }
593            myConverterData->myConverterArray[CNS_11643] =
594                ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
595            /* END android-changed */
596
597
598            /* set the function pointers to appropriate funtions */
599            cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
600            uprv_strcpy(myConverterData->locale,"cn");
601
602            if (version==0){
603                myConverterData->version = 0;
604                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
605            }else if (version==1){
606                myConverterData->version = 1;
607                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
608            }else {
609                myConverterData->version = 2;
610                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
611            }
612        }
613        else{
614#ifdef U_ENABLE_GENERIC_ISO_2022
615            myConverterData->isFirstBuffer = TRUE;
616
617            /* append the UTF-8 escape sequence */
618            cnv->charErrorBufferLength = 3;
619            cnv->charErrorBuffer[0] = 0x1b;
620            cnv->charErrorBuffer[1] = 0x25;
621            cnv->charErrorBuffer[2] = 0x42;
622
623            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
624            /* initialize the state variables */
625            uprv_strcpy(myConverterData->name,"ISO_2022");
626#else
627            *errorCode = U_UNSUPPORTED_ERROR;
628            return;
629#endif
630        }
631
632        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
633
634        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
635            _ISO2022Close(cnv);
636        }
637    } else {
638        *errorCode = U_MEMORY_ALLOCATION_ERROR;
639    }
640}
641
642
643static void
644_ISO2022Close(UConverter *converter) {
645    UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
646    UConverterSharedData **array = myData->myConverterArray;
647    int32_t i;
648
649    if (converter->extraInfo != NULL) {
650        /*close the array of converter pointers and free the memory*/
651        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
652            if(array[i]!=NULL) {
653                ucnv_unloadSharedDataIfReady(array[i]);
654            }
655        }
656
657        ucnv_close(myData->currentConverter);
658
659        if(!converter->isExtraLocal){
660            uprv_free (converter->extraInfo);
661            converter->extraInfo = NULL;
662        }
663    }
664}
665
666static void
667_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
668    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
669    if(choice<=UCNV_RESET_TO_UNICODE) {
670        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
671        myConverterData->key = 0;
672        myConverterData->isEmptySegment = FALSE;
673    }
674    if(choice!=UCNV_RESET_TO_UNICODE) {
675        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
676    }
677#ifdef U_ENABLE_GENERIC_ISO_2022
678    if(myConverterData->locale[0] == 0){
679        if(choice<=UCNV_RESET_TO_UNICODE) {
680            myConverterData->isFirstBuffer = TRUE;
681            myConverterData->key = 0;
682            if (converter->mode == UCNV_SO){
683                ucnv_close (myConverterData->currentConverter);
684                myConverterData->currentConverter=NULL;
685            }
686            converter->mode = UCNV_SI;
687        }
688        if(choice!=UCNV_RESET_TO_UNICODE) {
689            /* re-append UTF-8 escape sequence */
690            converter->charErrorBufferLength = 3;
691            converter->charErrorBuffer[0] = 0x1b;
692            converter->charErrorBuffer[1] = 0x28;
693            converter->charErrorBuffer[2] = 0x42;
694        }
695    }
696    else
697#endif
698    {
699        /* reset the state variables */
700        if(myConverterData->locale[0] == 'k'){
701            if(choice<=UCNV_RESET_TO_UNICODE) {
702                setInitialStateToUnicodeKR(converter, myConverterData);
703            }
704            if(choice!=UCNV_RESET_TO_UNICODE) {
705                setInitialStateFromUnicodeKR(converter, myConverterData);
706            }
707        }
708    }
709}
710
711static const char*
712_ISO2022getName(const UConverter* cnv){
713    if(cnv->extraInfo){
714        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
715        return myData->name;
716    }
717    return NULL;
718}
719
720
721/*************** to unicode *******************/
722/****************************************************************************
723 * Recognized escape sequences are
724 * <ESC>(B  ASCII
725 * <ESC>.A  ISO-8859-1
726 * <ESC>.F  ISO-8859-7
727 * <ESC>(J  JISX-201
 * <ESC>(I  JISX-201 half-width Katakana (HWKANA_7BIT)
729 * <ESC>$B  JISX-208
730 * <ESC>$@  JISX-208
731 * <ESC>$(D JISX-212
732 * <ESC>$A  GB2312
733 * <ESC>$(C KSC5601
734 */
/*
 * ISO-2022-JP: indexed by the table offset that getKey_2022() reports for a
 * recognized escape sequence. changeState_2022() casts the entry to a
 * StateEnum; an INVALID_STATE entry makes it report
 * U_UNSUPPORTED_ESCAPE_SEQUENCE for this variant.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
746
747/*************** to unicode *******************/
/*
 * ISO-2022-CN: indexed by the table offset that getKey_2022() reports for a
 * recognized escape sequence. changeState_2022() casts the entry to a
 * StateEnum; an INVALID_STATE entry makes it report
 * U_UNSUPPORTED_ESCAPE_SEQUENCE for this variant.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
759
760
761static UCNV_TableStates_2022
762getKey_2022(char c,int32_t* key,int32_t* offset){
763    int32_t togo;
764    int32_t low = 0;
765    int32_t hi = MAX_STATES_2022;
766    int32_t oldmid=0;
767
768    togo = normalize_esq_chars_2022[(uint8_t)c];
769    if(togo == 0) {
770        /* not a valid character anywhere in an escape sequence */
771        *key = 0;
772        *offset = 0;
773        return INVALID_2022;
774    }
775    togo = (*key << 5) + togo;
776
777    while (hi != low)  /*binary search*/{
778
779        int32_t mid = (hi+low) >> 1; /*Finds median*/
780
781        if (mid == oldmid)
782            break;
783
784        if (escSeqStateTable_Key_2022[mid] > togo){
785            hi = mid;
786        }
787        else if (escSeqStateTable_Key_2022[mid] < togo){
788            low = mid;
789        }
790        else /*we found it*/{
791            *key = togo;
792            *offset = mid;
793            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
794        }
795        oldmid = mid;
796
797    }
798
799    *key = 0;
800    *offset = 0;
801    return INVALID_2022;
802}
803
804/*runs through a state machine to determine the escape sequence - codepage correspondance
805 */
806static void
807changeState_2022(UConverter* _this,
808                const char** source,
809                const char* sourceLimit,
810                Variant2022 var,
811                UErrorCode* err){
812    UCNV_TableStates_2022 value;
813    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
814    uint32_t key = myData2022->key;
815    int32_t offset = 0;
816    int8_t initialToULength = _this->toULength;
817    char c;
818
819    value = VALID_NON_TERMINAL_2022;
820    while (*source < sourceLimit) {
821        c = *(*source)++;
822        _this->toUBytes[_this->toULength++]=(uint8_t)c;
823        value = getKey_2022(c,(int32_t *) &key, &offset);
824
825        switch (value){
826
827        case VALID_NON_TERMINAL_2022 :
828            /* continue with the loop */
829            break;
830
831        case VALID_TERMINAL_2022:
832            key = 0;
833            goto DONE;
834
835        case INVALID_2022:
836            goto DONE;
837
838        case VALID_MAYBE_TERMINAL_2022:
839#ifdef U_ENABLE_GENERIC_ISO_2022
840            /* ESC ( B is ambiguous only for ISO_2022 itself */
841            if(var == ISO_2022) {
842                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
843                _this->toULength = 0;
844
845                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
846
847                /* continue with the loop */
848                value = VALID_NON_TERMINAL_2022;
849                break;
850            } else
851#endif
852            {
853                /* not ISO_2022 itself, finish here */
854                value = VALID_TERMINAL_2022;
855                key = 0;
856                goto DONE;
857            }
858        }
859    }
860
861DONE:
862    myData2022->key = key;
863
864    if (value == VALID_NON_TERMINAL_2022) {
865        /* indicate that the escape sequence is incomplete: key!=0 */
866        return;
867    } else if (value == INVALID_2022 ) {
868        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
869    } else /* value == VALID_TERMINAL_2022 */ {
870        switch(var){
871#ifdef U_ENABLE_GENERIC_ISO_2022
872        case ISO_2022:
873        {
874            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
875            if(chosenConverterName == NULL) {
876                /* SS2 or SS3 */
877                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
878                _this->toUCallbackReason = UCNV_UNASSIGNED;
879                return;
880            }
881
882            _this->mode = UCNV_SI;
883            ucnv_close(myData2022->currentConverter);
884            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
885            if(U_SUCCESS(*err)) {
886                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
887                _this->mode = UCNV_SO;
888            }
889            break;
890        }
891#endif
892        case ISO_2022_JP:
893            {
894                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
895                switch(tempState) {
896                case INVALID_STATE:
897                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
898                    break;
899                case SS2_STATE:
900                    if(myData2022->toU2022State.cs[2]!=0) {
901                        if(myData2022->toU2022State.g<2) {
902                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
903                        }
904                        myData2022->toU2022State.g=2;
905                    } else {
906                        /* illegal to have SS2 before a matching designator */
907                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
908                    }
909                    break;
910                /* case SS3_STATE: not used in ISO-2022-JP-x */
911                case ISO8859_1:
912                case ISO8859_7:
913                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
914                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
915                    } else {
916                        /* G2 charset for SS2 */
917                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
918                    }
919                    break;
920                default:
921                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
922                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
923                    } else {
924                        /* G0 charset */
925                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
926                    }
927                    break;
928                }
929            }
930            break;
931        case ISO_2022_CN:
932            {
933                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
934                switch(tempState) {
935                case INVALID_STATE:
936                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
937                    break;
938                case SS2_STATE:
939                    if(myData2022->toU2022State.cs[2]!=0) {
940                        if(myData2022->toU2022State.g<2) {
941                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
942                        }
943                        myData2022->toU2022State.g=2;
944                    } else {
945                        /* illegal to have SS2 before a matching designator */
946                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
947                    }
948                    break;
949                case SS3_STATE:
950                    if(myData2022->toU2022State.cs[3]!=0) {
951                        if(myData2022->toU2022State.g<2) {
952                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
953                        }
954                        myData2022->toU2022State.g=3;
955                    } else {
956                        /* illegal to have SS3 before a matching designator */
957                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
958                    }
959                    break;
960                case ISO_IR_165:
961                    if(myData2022->version==0) {
962                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
963                        break;
964                    }
965                    /*fall through*/
966                case GB2312_1:
967                    /*fall through*/
968                case CNS_11643_1:
969                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
970                    break;
971                case CNS_11643_2:
972                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
973                    break;
974                default:
975                    /* other CNS 11643 planes */
976                    if(myData2022->version==0) {
977                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
978                    } else {
979                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
980                    }
981                    break;
982                }
983            }
984            break;
985        case ISO_2022_KR:
986            if(offset==0x30){
987                /* nothing to be done, just accept this one escape sequence */
988            } else {
989                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
990            }
991            break;
992
993        default:
994            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
995            break;
996        }
997    }
998    if(U_SUCCESS(*err)) {
999        _this->toULength = 0;
1000    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1001        if(_this->toULength>1) {
1002            /*
1003             * Ticket 5691: consistent illegal sequences:
1004             * - We include at least the first byte (ESC) in the illegal sequence.
1005             * - If any of the non-initial bytes could be the start of a character,
1006             *   we stop the illegal sequence before the first one of those.
1007             *   In escape sequences, all following bytes are "printable", that is,
1008             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1009             *   they are valid single/lead bytes.
1010             *   For simplicity, we always only report the initial ESC byte as the
1011             *   illegal sequence and back out all other bytes we looked at.
1012             */
1013            /* Back out some bytes. */
1014            int8_t backOutDistance=_this->toULength-1;
1015            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1016            if(backOutDistance<=bytesFromThisBuffer) {
1017                /* same as initialToULength<=1 */
1018                *source-=backOutDistance;
1019            } else {
1020                /* Back out bytes from the previous buffer: Need to replay them. */
1021                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1022                /* same as -(initialToULength-1) */
1023                /* preToULength is negative! */
1024                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1025                *source-=bytesFromThisBuffer;
1026            }
1027            _this->toULength=1;
1028        }
1029    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1030        _this->toUCallbackReason = UCNV_UNASSIGNED;
1031    }
1032}
1033
1034/*Checks the characters of the buffer against valid 2022 escape sequences
1035*if the match we return a pointer to the initial start of the sequence otherwise
1036*we return sourceLimit
1037*/
1038/*for 2022 looks ahead in the stream
1039 *to determine the longest possible convertible
1040 *data stream
1041 */
1042static inline const char*
1043getEndOfBuffer_2022(const char** source,
1044                   const char* sourceLimit,
1045                   UBool /*flush*/){
1046
1047    const char* mySource = *source;
1048
1049#ifdef U_ENABLE_GENERIC_ISO_2022
1050    if (*source >= sourceLimit)
1051        return sourceLimit;
1052
1053    do{
1054
1055        if (*mySource == ESC_2022){
1056            int8_t i;
1057            int32_t key = 0;
1058            int32_t offset;
1059            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1060
1061            /* Kludge: I could not
1062            * figure out the reason for validating an escape sequence
1063            * twice - once here and once in changeState_2022().
1064            * is it possible to have an ESC character in a ISO2022
1065            * byte stream which is valid in a code page? Is it legal?
1066            */
1067            for (i=0;
1068            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1069            i++) {
1070                value =  getKey_2022(*(mySource+i), &key, &offset);
1071            }
1072            if (value > 0 || *mySource==ESC_2022)
1073                return mySource;
1074
1075            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1076                return sourceLimit;
1077        }
1078    }while (++mySource < sourceLimit);
1079
1080    return sourceLimit;
1081#else
1082    while(mySource < sourceLimit && *mySource != ESC_2022) {
1083        ++mySource;
1084    }
1085    return mySource;
1086#endif
1087}
1088
1089
1090/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1091 * any future change in _MBCSFromUChar32() function should be reflected here.
1092 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1093 */
1094static inline int32_t
1095MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1096                                         UChar32 c,
1097                                         uint32_t* value,
1098                                         UBool useFallback,
1099                                         int outputType)
1100{
1101    const int32_t *cx;
1102    const uint16_t *table;
1103    uint32_t stage2Entry;
1104    uint32_t myValue;
1105    int32_t length;
1106    const uint8_t *p;
1107    /*
1108     * TODO(markus): Use and require new, faster MBCS conversion table structures.
1109     * Use internal version of ucnv_open() that verifies that the new structures are available,
1110     * else U_INTERNAL_PROGRAM_ERROR.
1111     */
1112    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1113    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1114        table=sharedData->mbcs.fromUnicodeTable;
1115        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1116        /* get the bytes and the length for the output */
1117        if(outputType==MBCS_OUTPUT_2){
1118            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1119            if(myValue<=0xff) {
1120                length=1;
1121            } else {
1122                length=2;
1123            }
1124        } else /* outputType==MBCS_OUTPUT_3 */ {
1125            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1126            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1127            if(myValue<=0xff) {
1128                length=1;
1129            } else if(myValue<=0xffff) {
1130                length=2;
1131            } else {
1132                length=3;
1133            }
1134        }
1135        /* is this code point assigned, or do we use fallbacks? */
1136        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1137            /* assigned */
1138            *value=myValue;
1139            return length;
1140        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1141            /*
1142             * We allow a 0 byte output if the "assigned" bit is set for this entry.
1143             * There is no way with this data structure for fallback output
1144             * to be a zero byte.
1145             */
1146            *value=myValue;
1147            return -length;
1148        }
1149    }
1150
1151    cx=sharedData->mbcs.extIndexes;
1152    if(cx!=NULL) {
1153        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1154    }
1155
1156    /* unassigned */
1157    return 0;
1158}
1159
1160/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1161 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1162 * @param retval pointer to output byte
1163 * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1164 */
1165static inline int32_t
1166MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1167                                       UChar32 c,
1168                                       uint32_t* retval,
1169                                       UBool useFallback)
1170{
1171    const uint16_t *table;
1172    int32_t value;
1173    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1174    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1175        return 0;
1176    }
1177    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1178    table=sharedData->mbcs.fromUnicodeTable;
1179    /* get the byte for the output */
1180    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1181    /* is this code point assigned, or do we use fallbacks? */
1182    *retval=(uint32_t)(value&0xff);
1183    if(value>=0xf00) {
1184        return 1;  /* roundtrip */
1185    } else if(useFallback ? value>=0x800 : value>=0xc00) {
1186        return -1;  /* fallback taken */
1187    } else {
1188        return 0;  /* no mapping */
1189    }
1190}
1191
/*
 * Converts a strict EUC DBCS value to the ISO 2022 byte range.
 * The low 16 bits must lie in A1A1..FEFE and the low byte in A1..FE;
 * then 0x8080 is subtracted so that both bytes land in 21..7E.
 * Anything else yields 0.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairDistance = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDistance = (uint8_t)(value - 0xa1);

    if(pairDistance > (0xfefe - 0xa1a1) || trailDistance > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1208
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Disabled code, kept for reference only.
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    /* same range test as in _2022FromGR94DBCS(), applied to the shifted value */
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
1226
1227#ifdef U_ENABLE_GENERIC_ISO_2022
1228
1229/**********************************************************************************
1230*  ISO-2022 Converter
1231*
1232*
1233*/
1234
/*
 * Generic ISO-2022 to-Unicode conversion (only built with U_ENABLE_GENERIC_ISO_2022).
 *
 * Repeats two steps until the input is used up or something needs the caller:
 * 1. When no escape sequence is pending (key==0), converts the bytes up to the
 *    position returned by getEndOfBuffer_2022() with the current sub-converter.
 *    An "ASCII" converter is opened if there is none yet.
 * 2. Calls changeState_2022() to interpret the escape sequence that follows.
 *
 * Overflow output, invalid bytes and partial input are copied from the
 * sub-converter into this converter before returning.
 *
 * @param args conversion arguments; source and target are advanced
 * @param err  receives errors from the sub-converter or from changeState_2022()
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no sub-converter yet: open an ASCII converter */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily point args at the sub-converter; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /* the third parenthesized group reads as (A && B) || C because && binds tighter than || */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing a pending escape sequence */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1338
1339#endif
1340
1341/*
1342 * To Unicode Callback helper function
1343 */
1344static void
1345toUnicodeCallback(UConverter *cnv,
1346                  const uint32_t sourceChar, const uint32_t targetUniChar,
1347                  UErrorCode* err){
1348    if(sourceChar>0xff){
1349        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1350        cnv->toUBytes[1] = (uint8_t)sourceChar;
1351        cnv->toULength = 2;
1352    }
1353    else{
1354        cnv->toUBytes[0] =(char) sourceChar;
1355        cnv->toULength = 1;
1356    }
1357
1358    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1359        *err = U_INVALID_CHAR_FOUND;
1360    }
1361    else{
1362        *err = U_ILLEGAL_CHAR_FOUND;
1363    }
1364}
1365
1366/**************************************ISO-2022-JP*************************************************/
1367
1368/************************************** IMPORTANT **************************************************
1369* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1370* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1371* The converter iterates over each Unicode codepoint
1372* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1373* processed one char at a time it would make sense to reduce the extra processing a canned converter
1374* would do as far as possible.
1375*
1376* If the implementation of these macros or structure of sharedData struct change in the future, make
1377* sure that ISO-2022 is also changed.
1378***************************************************************************************************
1379*/
1380
1381/***************************************************************************************************
1382* Rules for ISO-2022-jp encoding
1383* (i)   Escape sequences must be fully contained within a line they should not
1384*       span new lines or CRs
1385* (ii)  If the last character on a line is represented by two bytes then an ASCII or
1386*       JIS-Roman character escape sequence should follow before the line terminates
1387* (iii) If the first character on the line is represented by two bytes then a two
1388*       byte character escape sequence should precede it
1389* (iv)  If no escape sequence is encountered then the characters are ASCII
1390* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1391*       and invoked with SS2 (ESC N).
1392* (vi)  If there is any G0 designation in text, there must be a switch to
1393*       ASCII or to JIS X 0201-Roman before a space character (but not
1394*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1395*       characters such as tab or CRLF.
* (vii) Supported encodings:
1397*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1398*
1399*  source : RFC-1554
1400*
1401*          JISX201, JISX208,JISX212 : new .cnv data files created
1402*          KSC5601 : alias to ibm-949 mapping table
1403*          GB2312 : alias to ibm-1386 mapping table
1404*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
1406*/
1407
/*
 * Preference order of the JP charsets.
 * When the fromUnicode function below builds its list of charsets to try,
 * it first adds the currently designated charsets and then walks this array
 * in order, appending each charset that is still allowed and not yet listed.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1420
/*
 * Designation escape sequences, one per charset.
 * The array is indexed by the charset's enum constant, so the escape
 * sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 * The number of bytes to copy for each entry is in escSeqCharsLen[].
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Lengths in bytes of the escape sequences in escSeqChars[].
 * Both arrays are indexed with the same charset value, so the entries
 * here must stay in the same order as the entries in escSeqChars[].
 */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1448
1449/*
1450* The iteration over various code pages works this way:
1451* i)   Get the currentState from myConverterData->currentState
1452* ii)  Check if the character is mapped to a valid character in the currentState
1453*      Yes ->  a) set the initIterState to currentState
1454*       b) remain in this state until an invalid character is found
1455*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if the current state
*      is equal to the initIteration state
1458*      Yes ->  A character that cannot be represented in any of the supported encodings
1459*       break and return a U_INVALID_CHARACTER error
1460*      No  ->  Continue and find the character in next code page
1461*
1462*
1463* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1464*/
1465
/*
 * Convert a JIS X 0201 byte value in 00..7F to Unicode.
 * Only two positions differ from the identity mapping:
 * 5C maps to U+00A5 and 7E maps to U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        /* every other value maps to itself */
        return value;
    }
}
1479
/*
 * Convert a Unicode code point to a JIS X 0201 byte value in 00..7F.
 * U+00A5 and U+203E take over the byte positions 5C and 7E, so U+005C and
 * U+007E themselves have no mapping.
 * Returns U+FFFE for every code point that cannot be mapped.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two positions are occupied by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return value <= 0x7f ? value : 0xfffe;
    }
}
1494
/*
 * Map a valid Shift-JIS byte pair (lead byte in bits 15..8, trail byte in
 * bits 7..0) to the corresponding pair of JIS X 0208 bytes in 21..7E.
 * Byte pairs above EFFC lie beyond JIS X 0208; for those the result is 0.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead, result;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;
    lead = value & 0xff00;

    /* lead bytes up to 9F use one offset, lead bytes E0..EF another */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;

    /* each Shift-JIS lead byte covers two JIS X 0208 rows */
    result = lead << 1;

    if(trail > 0x9e) {
        /* trail <= 0xfc: second (even) row of the pair */
        result |= trail - 0x7e;
    } else {
        /* first (odd) row of the pair; trail byte 7F is skipped in Shift-JIS */
        result -= 0x100;
        result |= trail - ((trail <= 0x7e) ? 0x1f : 0x20);
    }
    return result;
}
1530
/*
 * Map a pair of JIS X 0208 bytes (each expected in 21..7E) to Shift-JIS
 * and store the two resulting bytes in bytes[0] and bytes[1].
 * When an input byte is outside 21..7E the result must not be valid
 * Shift-JIS, so that the converter catches it. Some invalid inputs are
 * explicitly turned into 0; others already map to equally invalid Shift-JIS
 * byte values and are not tested explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead = c1;
    uint8_t trail = c2;

    if((lead & 1) == 0) {
        /* even row: second half of the Shift-JIS trail byte range */
        if((uint8_t)(trail - 0x21) <= ((0x7e) - 0x21)) {
            trail = (uint8_t)(trail + 0x7e);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* odd row: first half of the trail byte range, which skips 7F */
        lead = (uint8_t)(lead + 1);
        if(trail <= 0x5f) {
            trail = (uint8_t)(trail + 0x1f);
        } else if(trail <= 0x7e) {
            trail = (uint8_t)(trail + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    }

    /* two JIS X 0208 rows share one Shift-JIS lead byte */
    lead = (uint8_t)(lead >> 1);
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1567
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table is indexed by (code point - HWKANA_START); each entry is the
 * two-byte JIS X 0208 value that the fromUnicode function below uses as a
 * fallback result. The U+xxxx markers on the right show which code point an
 * entry belongs to.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C,  /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F,  /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F,  /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C   /* U+FF9F */
};
1640
/*
 * Convert UTF-16 text from args->source to ISO-2022-JP bytes in args->target.
 *
 * For each code point the function tries a list of candidate charsets
 * (choices[]), picks a mapping, and writes whatever SI/SO, designation
 * escape sequence and single-shift bytes are needed before the character
 * bytes. If args->offsets is not NULL, one source index is written per
 * output byte.
 *
 * State that is kept between calls:
 * - cnv->fromUChar32: a lead surrogate whose trail has not been seen yet,
 *   or the code point that caused an error
 * - converterData->fromU2022State: the designated charsets cs[] and the
 *   current shift state g
 *
 * Errors set in this function:
 * - U_ILLEGAL_CHAR_FOUND for unmatched surrogates and for SO/SI/ESC input
 * - U_INVALID_CHAR_FOUND if none of the candidate charsets maps the character
 * - U_BUFFER_OVERFLOW_ERROR if there is input left but the target is full
 * fromUWriteUInt8() receives err as well and may set it.
 *
 * On return args->source and args->target are updated to the positions
 * reached.
 *
 * @param args conversion arguments (converter, source/target buffers, offsets, flush)
 * @param err  in/out error code
 */
static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    char buffer[8];             /* output bytes for one character, including SI/escape/shift */
    int32_t len, outLen;
    int8_t choices[10];         /* charsets to try, in order; rebuilt when choiceCount==0 */
    int32_t choiceCount;
    uint32_t targetValue = 0;
    UBool useFallback;

    int32_t i;
    int8_t cs, g;

    /* set up the state */
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State   = &converterData->fromU2022State;

    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        goto getTrail;
    }

    while(source < sourceLimit) {
        if(target < targetLimit) {

            sourceChar  = *(source++);
            /*check if the char is a First surrogate*/
            if(U16_IS_SURROGATE(sourceChar)) {
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(U16_IS_TRAIL(trail)) {
                            source++;
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                            cnv->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            cnv->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input: remember the lead surrogate for the next call */
                        cnv->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            /* do the conversion */

            /* (re)build the list of charsets to try if it has been invalidated */
            if(choiceCount == 0) {
                uint16_t csm;

                /*
                 * The csm variable keeps track of which charsets are allowed
                 * and not used yet while building the choices[].
                 */
                csm = jpCharsetMasks[converterData->version];
                choiceCount = 0;

                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
                if(converterData->version == 3 || converterData->version == 4) {
                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
                }
                /* Do not try single-byte half-width Katakana for other versions. */
                csm &= ~CSM(HWKANA_7BIT);

                /* try the current G0 charset */
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
                csm &= ~CSM(cs);

                /* try the current G2 charset */
                if((cs = pFromU2022State->cs[2]) != 0) {
                    choices[choiceCount++] = cs;
                    csm &= ~CSM(cs);
                }

                /* try all the other possible charsets */
                for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
                    cs = (int8_t)jpCharsetPref[i];
                    if(CSM(cs) & csm) {
                        choices[choiceCount++] = cs;
                        csm &= ~CSM(cs);
                    }
                }
            }

            /* cs and g will hold the charset and the G set of the mapping that is found */
            cs = g = 0;
            /*
             * len==0: no mapping found yet
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
             * len>0: found a roundtrip result, done
             */
            len = 0;
            /*
             * We will turn off useFallback after finding a fallback,
             * but we still get fallbacks from PUA code points as usual.
             * Therefore, we will also need to check that we don't overwrite
             * an early fallback with a later one.
             */
            useFallback = cnv->useFallback;

            for(i = 0; i < choiceCount && len <= 0; ++i) {
                uint32_t value;
                int32_t len2;
                int8_t cs0 = choices[i];
                switch(cs0) {
                case ASCII:
                    if(sourceChar <= 0x7f) {
                        targetValue = (uint32_t)sourceChar;
                        len = 1;
                        cs = cs0;
                        g = 0;
                    }
                    break;
                case ISO8859_1:
                    /* G2 charset: output the byte with the high bit removed */
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
                        targetValue = (uint32_t)sourceChar - 0x80;
                        len = 1;
                        cs = cs0;
                        g = 2;
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        if(converterData->version==3) {
                            /* JIS7: use G1 (SO) */
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
                            len = 1;
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
                            g = 1;
                        } else if(converterData->version==4) {
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
                            len = 1;

                            cs = pFromU2022State->cs[0];
                            if(IS_JP_DBCS(cs)) {
                                /* switch from a DBCS charset to JISX201 */
                                cs = (int8_t)JISX201;
                            }
                            /* else stay in the current G0 charset */
                            g = 0;
                        }
                        /* else do not use HWKANA_7BIT with other versions */
                    }
                    break;
                case JISX201:
                    /* G0 SBCS */
                    value = jisx201FromU(sourceChar);
                    if(value <= 0x7f) {
                        targetValue = value;
                        len = 1;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case JISX208:
                    /* G0 DBCS from Shift-JIS table */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        /* _2022FromSJIS() returns 0 if the byte pair is beyond JIS X 0208 */
                        value = _2022FromSJIS(value);
                        if(value != 0) {
                            targetValue = value;
                            len = len2;
                            cs = cs0;
                            g = 0;
                            useFallback = FALSE;
                        }
                    } else if(len == 0 && useFallback &&
                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        /* hardcoded fallback from half-width to full-width Katakana */
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
                        len = -2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case ISO8859_7:
                    /* G0 SBCS forced to 7-bit output */
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback);
                    /* accept a result in the GR96 range, but do not replace an earlier fallback with another fallback */
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
                        targetValue = value - 0x80;
                        len = len2;
                        cs = cs0;
                        g = 2;
                        useFallback = FALSE;
                    }
                    break;
                default:
                    /* G0 DBCS */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        if(cs0 == KSC5601) {
                            /*
                             * Check for valid bytes for the encoding scheme.
                             * This is necessary because the sub-converter (windows-949)
                             * has a broader encoding scheme than is valid for 2022.
                             */
                            value = _2022FromGR94DBCS(value);
                            if(value == 0) {
                                break;
                            }
                        }
                        targetValue = value;
                        len = len2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                }
            }

            if(len != 0) {
                if(len < 0) {
                    len = -len;  /* fallback */
                }
                outLen = 0; /* count output bytes */

                /* write SI if necessary (only for JIS7) */
                if(pFromU2022State->g == 1 && g == 0) {
                    buffer[outLen++] = UCNV_SI;
                    pFromU2022State->g = 0;
                }

                /* write the designation sequence if necessary */
                if(cs != pFromU2022State->cs[g]) {
                    int32_t escLen = escSeqCharsLen[cs];
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
                    outLen += escLen;
                    pFromU2022State->cs[g] = cs;

                    /* invalidate the choices[] */
                    choiceCount = 0;
                }

                /* write the shift sequence if necessary */
                if(g != pFromU2022State->g) {
                    switch(g) {
                    /* case 0 handled before writing escapes */
                    case 1:
                        buffer[outLen++] = UCNV_SO;
                        pFromU2022State->g = 1;
                        break;
                    default: /* case 2 */
                        /* single shift ESC N; the stored g state is not changed here */
                        buffer[outLen++] = 0x1b;
                        buffer[outLen++] = 0x4e;
                        break;
                    /* no case 3: no SS3 in ISO-2022-JP-x */
                    }
                }

                /* write the output bytes */
                if(len == 1) {
                    buffer[outLen++] = (char)targetValue;
                } else /* len == 2 */ {
                    buffer[outLen++] = (char)(targetValue >> 8);
                    buffer[outLen++] = (char)targetValue;
                }
            } else {
                /*
                 * if we cannot find the character after checking all codepages
                 * then this is an error
                 */
                *err = U_INVALID_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            if(sourceChar == CR || sourceChar == LF) {
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
                pFromU2022State->cs[2] = 0;
                choiceCount = 0;
            }

            /* output outLen>0 bytes in buffer[] */
            if(outLen == 1) {
                /* there is room for one byte: target < targetLimit was checked at the top of the loop */
                *target++ = buffer[0];
                if(offsets) {
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
                }
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
                *target++ = buffer[0];
                *target++ = buffer[1];
                if(offsets) {
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
                    *offsets++ = sourceIndex;
                    *offsets++ = sourceIndex;
                }
            } else {
                /* longer output, or output that may not fit: let the helper write it */
                fromUWriteUInt8(
                    cnv,
                    buffer, outLen,
                    &target, (const char *)targetLimit,
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
                    err);
                if(U_FAILURE(*err)) {
                    break;
                }
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-JP conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   in SO mode or not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
    ) {
        int32_t sourceIndex;

        outLen = 0;

        /* leave SO mode first */
        if(pFromU2022State->g != 0) {
            buffer[outLen++] = UCNV_SI;
            pFromU2022State->g = 0;
        }

        /* then designate ASCII to G0 */
        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more if the last code unit is the trail of a surrogate pair */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            /* no input was consumed from this buffer */
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            cnv,
            buffer, outLen,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}
2055
2056/*************** to unicode *******************/
2057
2058static void
2059UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2060                                               UErrorCode* err){
2061    char tempBuf[2];
2062    const char *mySource = (char *) args->source;
2063    UChar *myTarget = args->target;
2064    const char *mySourceLimit = args->sourceLimit;
2065    uint32_t targetUniChar = 0x0000;
2066    uint32_t mySourceChar = 0x0000;
2067    uint32_t tmpSourceChar = 0x0000;
2068    UConverterDataISO2022* myData;
2069    ISO2022State *pToU2022State;
2070    StateEnum cs;
2071
2072    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2073    pToU2022State = &myData->toU2022State;
2074
2075    if(myData->key != 0) {
2076        /* continue with a partial escape sequence */
2077        goto escape;
2078    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2079        /* continue with a partial double-byte character */
2080        mySourceChar = args->converter->toUBytes[0];
2081        args->converter->toULength = 0;
2082        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2083        targetUniChar = missingCharMarker;
2084        goto getTrailByte;
2085    }
2086
2087    while(mySource < mySourceLimit){
2088
2089        targetUniChar =missingCharMarker;
2090
2091        if(myTarget < args->targetLimit){
2092
2093            mySourceChar= (unsigned char) *mySource++;
2094
2095            switch(mySourceChar) {
2096            case UCNV_SI:
2097                if(myData->version==3) {
2098                    pToU2022State->g=0;
2099                    continue;
2100                } else {
2101                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2102                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2103                    break;
2104                }
2105
2106            case UCNV_SO:
2107                if(myData->version==3) {
2108                    /* JIS7: switch to G1 half-width Katakana */
2109                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2110                    pToU2022State->g=1;
2111                    continue;
2112                } else {
2113                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2114                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2115                    break;
2116                }
2117
2118            case ESC_2022:
2119                mySource--;
2120escape:
2121                {
2122                    const char * mySourceBefore = mySource;
2123                    int8_t toULengthBefore = args->converter->toULength;
2124
2125                    changeState_2022(args->converter,&(mySource),
2126                        mySourceLimit, ISO_2022_JP,err);
2127
2128                    /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2129                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2130                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2131                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
2132                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2133                    }
2134                }
2135
2136                /* invalid or illegal escape sequence */
2137                if(U_FAILURE(*err)){
2138                    args->target = myTarget;
2139                    args->source = mySource;
2140                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
2141                    return;
2142                }
2143                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2144                if(myData->key==0) {
2145                    myData->isEmptySegment = TRUE;
2146                }
2147                continue;
2148
2149            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2150
2151            case CR:
2152                /*falls through*/
2153            case LF:
2154                /* automatically reset to single-byte mode */
2155                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2156                    pToU2022State->cs[0] = (int8_t)ASCII;
2157                }
2158                pToU2022State->cs[2] = 0;
2159                pToU2022State->g = 0;
2160                /* falls through */
2161            default:
2162                /* convert one or two bytes */
2163                myData->isEmptySegment = FALSE;
2164                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2165                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2166                    !IS_JP_DBCS(cs)
2167                ) {
2168                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2169                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2170
2171                    /* return from a single-shift state to the previous one */
2172                    if(pToU2022State->g >= 2) {
2173                        pToU2022State->g=pToU2022State->prevG;
2174                    }
2175                } else switch(cs) {
2176                case ASCII:
2177                    if(mySourceChar <= 0x7f) {
2178                        targetUniChar = mySourceChar;
2179                    }
2180                    break;
2181                case ISO8859_1:
2182                    if(mySourceChar <= 0x7f) {
2183                        targetUniChar = mySourceChar + 0x80;
2184                    }
2185                    /* return from a single-shift state to the previous one */
2186                    pToU2022State->g=pToU2022State->prevG;
2187                    break;
2188                case ISO8859_7:
2189                    if(mySourceChar <= 0x7f) {
2190                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
2191                        targetUniChar =
2192                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2193                                myData->myConverterArray[cs],
2194                                mySourceChar + 0x80);
2195                    }
2196                    /* return from a single-shift state to the previous one */
2197                    pToU2022State->g=pToU2022State->prevG;
2198                    break;
2199                case JISX201:
2200                    if(mySourceChar <= 0x7f) {
2201                        targetUniChar = jisx201ToU(mySourceChar);
2202                    }
2203                    break;
2204                case HWKANA_7BIT:
2205                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2206                        /* 7-bit halfwidth Katakana */
2207                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2208                    }
2209                    break;
2210                default:
2211                    /* G0 DBCS */
2212                    if(mySource < mySourceLimit) {
2213                        int leadIsOk, trailIsOk;
2214                        uint8_t trailByte;
2215getTrailByte:
2216                        trailByte = (uint8_t)*mySource;
2217                        /*
2218                         * Ticket 5691: consistent illegal sequences:
2219                         * - We include at least the first byte in the illegal sequence.
2220                         * - If any of the non-initial bytes could be the start of a character,
2221                         *   we stop the illegal sequence before the first one of those.
2222                         *
2223                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2224                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2225                         * Otherwise we convert or report the pair of bytes.
2226                         */
2227                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2228                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2229                        if (leadIsOk && trailIsOk) {
2230                            ++mySource;
2231                            tmpSourceChar = (mySourceChar << 8) | trailByte;
2232                            if(cs == JISX208) {
2233                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2234                                mySourceChar = tmpSourceChar;
2235                            } else {
2236                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2237                                mySourceChar = tmpSourceChar;
2238                                if (cs == KSC5601) {
2239                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2240                                }
2241                                tempBuf[0] = (char)(tmpSourceChar >> 8);
2242                                tempBuf[1] = (char)(tmpSourceChar);
2243                            }
2244                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2245                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2246                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2247                            ++mySource;
2248                            /* add another bit so that the code below writes 2 bytes in case of error */
2249                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2250                        }
2251                    } else {
2252                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2253                        args->converter->toULength = 1;
2254                        goto endloop;
2255                    }
2256                }  /* End of inner switch */
2257                break;
2258            }  /* End of outer switch */
2259            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2260                if(args->offsets){
2261                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2262                }
2263                *(myTarget++)=(UChar)targetUniChar;
2264            }
2265            else if(targetUniChar > missingCharMarker){
2266                /* disassemble the surrogate pair and write to output*/
2267                targetUniChar-=0x0010000;
2268                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2269                if(args->offsets){
2270                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2271                }
2272                ++myTarget;
2273                if(myTarget< args->targetLimit){
2274                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2275                    if(args->offsets){
2276                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2277                    }
2278                    ++myTarget;
2279                }else{
2280                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2281                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2282                }
2283
2284            }
2285            else{
2286                /* Call the callback function*/
2287                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2288                break;
2289            }
2290        }
2291        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2292            *err =U_BUFFER_OVERFLOW_ERROR;
2293            break;
2294        }
2295    }
2296endloop:
2297    args->target = myTarget;
2298    args->source = mySource;
2299}
2300
2301
2302/***************************************************************
2303*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
2307*  ii) There are only 2 shifting sequences SO to shift into double byte mode
2308*      and SI to shift into single byte mode
2309*/
2310static void
2311UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2312
2313    UConverter* saveConv = args->converter;
2314    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2315    args->converter=myConverterData->currentConverter;
2316
2317    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2318    ucnv_MBCSFromUnicodeWithOffsets(args,err);
2319    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2320
2321    if(*err == U_BUFFER_OVERFLOW_ERROR) {
2322        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2323            uprv_memcpy(
2324                saveConv->charErrorBuffer,
2325                myConverterData->currentConverter->charErrorBuffer,
2326                myConverterData->currentConverter->charErrorBufferLength);
2327        }
2328        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2329        myConverterData->currentConverter->charErrorBufferLength = 0;
2330    }
2331    args->converter=saveConv;
2332}
2333
/*
 * Converts UTF-16 input to ISO-2022-KR bytes, optionally recording offsets.
 *
 * For version==1 the work is delegated to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().  Otherwise each
 * code unit is looked up with MBCS_FROM_UCHAR32_ISO2022() in the table of
 * converterData->currentConverter:
 *   - a 1-byte result <= 0x7f is written as is (single-byte mode),
 *   - a 2-byte result with both bytes in 0xa1..0xfe is written with 0x80
 *     subtracted from each byte (double-byte mode),
 *   - anything else is treated as unmappable.
 * UCNV_SO / UCNV_SI is emitted whenever the mode changes between two
 * consecutive characters.  The current mode is kept across calls in
 * args->converter->fromUnicodeStatus.
 *
 * On an unmappable or illegal code point the loop stops with *err set and the
 * offending code point stored in args->converter->fromUChar32.  An unpaired
 * lead surrogate at the end of the input is stored the same way without an
 * error, and is picked up again (via getTrail) on the next call.
 *
 * @param args conversion arguments; source, target and (through the local
 *             offsets pointer) the offsets array are advanced
 * @param err  in/out error code
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546 pass the arguments to
     * MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    /* fromUnicodeStatus holds the mode left by the previous call: nonzero = double-byte (after SO) */
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): repeats the read a few lines above; it has no additional effect */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* a code point left over from the previous call: resume at the trail-surrogate lookup */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only DBCS or SBCS characters are expected*/
            /* DB characters with high bit set to 1 are expected */
            /*
             * Reject everything else: no result, more than 2 bytes, a single
             * byte above 0x7f, or a byte pair where the 16-bit value is not in
             * 0xa1a1..0xfefe or the low byte is not in 0xa1..0xfe.
             */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                /*
                 * Append the shift sequence when the mode changes.
                 * There is room for this one byte: target < targetLimit was
                 * checked at the top of this iteration.
                 */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    /* source was already advanced, so source-1 is the unit being converted */
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write targetByteUnit to the target; bytes that do not fit go to charErrorBuffer */
                if(targetByteUnit <= 0x00FF){
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double-byte: subtract 0x80 from each byte (0xa1..0xfe becomes 0x21..0x7e) */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* the code point is unassigned or illegal:
                 * set the error and reason
                 */

                /*check if the char is a First surrogate*/
                if(U16_IS_SURROGATE(sourceChar)) {
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                        /* also entered directly from the top when a code point was pending */
                        /*look ahead to find the trail surrogate*/
                        if(source <  sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(U16_IS_TRAIL(trail)) {
                                source++;
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                                /* the assembled supplementary code point is reported as unmappable */
                                *err = U_INVALID_CHAR_FOUND;
                                /* convert this surrogate code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input: keep the lead surrogate in fromUChar32 (set below) without an error */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                /* remember the offending or pending code point and stop */
                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more when the last unit looks like the trail of a surrogate pair */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            /* nothing was consumed in this call */
            sourceIndex=-1;
        }

        /* write the one-byte SHIFT_IN_STR (to the target, or to the overflow buffer if it does not fit) */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}
2550
2551/************************ To Unicode ***************************************/
2552
2553static void
2554UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2555                                                            UErrorCode* err){
2556    char const* sourceStart;
2557    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2558
2559    UConverterToUnicodeArgs subArgs;
2560    int32_t minArgsSize;
2561
2562    /* set up the subconverter arguments */
2563    if(args->size<sizeof(UConverterToUnicodeArgs)) {
2564        minArgsSize = args->size;
2565    } else {
2566        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2567    }
2568
2569    uprv_memcpy(&subArgs, args, minArgsSize);
2570    subArgs.size = (uint16_t)minArgsSize;
2571    subArgs.converter = myData->currentConverter;
2572
2573    /* remember the original start of the input for offsets */
2574    sourceStart = args->source;
2575
2576    if(myData->key != 0) {
2577        /* continue with a partial escape sequence */
2578        goto escape;
2579    }
2580
2581    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2582        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2583        subArgs.source = args->source;
2584        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2585        if(subArgs.source != subArgs.sourceLimit) {
2586            /*
2587             * get the current partial byte sequence
2588             *
2589             * it needs to be moved between the public and the subconverter
2590             * so that the conversion framework, which only sees the public
2591             * converter, can handle truncated and illegal input etc.
2592             */
2593            if(args->converter->toULength > 0) {
2594                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2595            }
2596            subArgs.converter->toULength = args->converter->toULength;
2597
2598            /*
2599             * Convert up to the end of the input, or to before the next escape character.
2600             * Does not handle conversion extensions because the preToU[] state etc.
2601             * is not copied.
2602             */
2603            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2604
2605            if(args->offsets != NULL && sourceStart != args->source) {
2606                /* update offsets to base them on the actual start of the input */
2607                int32_t *offsets = args->offsets;
2608                UChar *target = args->target;
2609                int32_t delta = (int32_t)(args->source - sourceStart);
2610                while(target < subArgs.target) {
2611                    if(*offsets >= 0) {
2612                        *offsets += delta;
2613                    }
2614                    ++offsets;
2615                    ++target;
2616                }
2617            }
2618            args->source = subArgs.source;
2619            args->target = subArgs.target;
2620            args->offsets = subArgs.offsets;
2621
2622            /* copy input/error/overflow buffers */
2623            if(subArgs.converter->toULength > 0) {
2624                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2625            }
2626            args->converter->toULength = subArgs.converter->toULength;
2627
2628            if(*err == U_BUFFER_OVERFLOW_ERROR) {
2629                if(subArgs.converter->UCharErrorBufferLength > 0) {
2630                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2631                                subArgs.converter->UCharErrorBufferLength);
2632                }
2633                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2634                subArgs.converter->UCharErrorBufferLength = 0;
2635            }
2636        }
2637
2638        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2639            return;
2640        }
2641
2642escape:
2643        changeState_2022(args->converter,
2644               &(args->source),
2645               args->sourceLimit,
2646               ISO_2022_KR,
2647               err);
2648    }
2649}
2650
2651static void
2652UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2653                                                            UErrorCode* err){
2654    char tempBuf[2];
2655    const char *mySource = ( char *) args->source;
2656    UChar *myTarget = args->target;
2657    const char *mySourceLimit = args->sourceLimit;
2658    UChar32 targetUniChar = 0x0000;
2659    UChar mySourceChar = 0x0000;
2660    UConverterDataISO2022* myData;
2661    UConverterSharedData* sharedData ;
2662    UBool useFallback;
2663
2664    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2665    if(myData->version==1){
2666        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2667        return;
2668    }
2669
2670    /* initialize state */
2671    sharedData = myData->currentConverter->sharedData;
2672    useFallback = args->converter->useFallback;
2673
2674    if(myData->key != 0) {
2675        /* continue with a partial escape sequence */
2676        goto escape;
2677    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2678        /* continue with a partial double-byte character */
2679        mySourceChar = args->converter->toUBytes[0];
2680        args->converter->toULength = 0;
2681        goto getTrailByte;
2682    }
2683
2684    while(mySource< mySourceLimit){
2685
2686        if(myTarget < args->targetLimit){
2687
2688            mySourceChar= (unsigned char) *mySource++;
2689
2690            if(mySourceChar==UCNV_SI){
2691                myData->toU2022State.g = 0;
2692                if (myData->isEmptySegment) {
2693                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
2694                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2695                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
2696                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2697                    args->converter->toULength = 1;
2698                    args->target = myTarget;
2699                    args->source = mySource;
2700                    return;
2701                }
2702                /*consume the source */
2703                continue;
2704            }else if(mySourceChar==UCNV_SO){
2705                myData->toU2022State.g = 1;
2706                myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
2707                /*consume the source */
2708                continue;
2709            }else if(mySourceChar==ESC_2022){
2710                mySource--;
2711escape:
2712                myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */
2713                changeState_2022(args->converter,&(mySource),
2714                                mySourceLimit, ISO_2022_KR, err);
2715                if(U_FAILURE(*err)){
2716                    args->target = myTarget;
2717                    args->source = mySource;
2718                    return;
2719                }
2720                continue;
2721            }
2722
2723            myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
2724            if(myData->toU2022State.g == 1) {
2725                if(mySource < mySourceLimit) {
2726                    int leadIsOk, trailIsOk;
2727                    uint8_t trailByte;
2728getTrailByte:
2729                    targetUniChar = missingCharMarker;
2730                    trailByte = (uint8_t)*mySource;
2731                    /*
2732                     * Ticket 5691: consistent illegal sequences:
2733                     * - We include at least the first byte in the illegal sequence.
2734                     * - If any of the non-initial bytes could be the start of a character,
2735                     *   we stop the illegal sequence before the first one of those.
2736                     *
2737                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2738                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2739                     * Otherwise we convert or report the pair of bytes.
2740                     */
2741                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2742                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2743                    if (leadIsOk && trailIsOk) {
2744                        ++mySource;
2745                        tempBuf[0] = (char)(mySourceChar + 0x80);
2746                        tempBuf[1] = (char)(trailByte + 0x80);
2747                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2748                        mySourceChar = (mySourceChar << 8) | trailByte;
2749                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2750                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2751                        ++mySource;
2752                        /* add another bit so that the code below writes 2 bytes in case of error */
2753                        mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2754                    }
2755                } else {
2756                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2757                    args->converter->toULength = 1;
2758                    break;
2759                }
2760            }
2761            else if(mySourceChar <= 0x7f) {
2762                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2763            } else {
2764                targetUniChar = 0xffff;
2765            }
2766            if(targetUniChar < 0xfffe){
2767                if(args->offsets) {
2768                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2769                }
2770                *(myTarget++)=(UChar)targetUniChar;
2771            }
2772            else {
2773                /* Call the callback function*/
2774                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2775                break;
2776            }
2777        }
2778        else{
2779            *err =U_BUFFER_OVERFLOW_ERROR;
2780            break;
2781        }
2782    }
2783    args->target = myTarget;
2784    args->source = mySource;
2785}
2786
2787/*************************** END ISO2022-KR *********************************/
2788
2789/*************************** ISO-2022-CN *********************************
2790*
2791* Rules for ISO-2022-CN Encoding:
2792* i)   The designator sequence must appear once on a line before any instance
2793*      of character set it designates.
2794* ii)  If two lines contain characters from the same character set, both lines
2795*      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence must follow
*      to invoke the designated character set
2798* iv)  All lines start in ASCII and end in ASCII.
2799* v)   Four shifting sequences are employed for this purpose:
2800*
*      Sequence    ASCII Eq    Charsets
2802*      ----------  -------    ---------
2803*      SI           <SI>        US-ASCII
2804*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2805*      SS2          <ESC>N      CNS-11643-1992 Plane 2
2806*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2807*
2808* vi)
2809*      SOdesignator  : ESC "$" ")" finalchar_for_SO
2810*      SS2designator : ESC "$" "*" finalchar_for_SS2
2811*      SS3designator : ESC "$" "+" finalchar_for_SS3
2812*
2813*      ESC $ ) A       Indicates the bytes following SO are Chinese
2814*       characters as defined in GB 2312-80, until
2815*       another SOdesignation appears
2816*
2817*
2818*      ESC $ ) E       Indicates the bytes following SO are as defined
2819*       in ISO-IR-165 (for details, see section 2.1),
2820*       until another SOdesignation appears
2821*
2822*      ESC $ ) G       Indicates the bytes following SO are as defined
2823*       in CNS 11643-plane-1, until another
2824*       SOdesignation appears
2825*
2826*      ESC $ * H       Indicates the two bytes immediately following
2827*       SS2 is a Chinese character as defined in CNS
2828*       11643-plane-2, until another SS2designation
2829*       appears
*       (Meaning <ESC>N must precede every 2 byte
*        sequence.)
2832*
2833*      ESC $ + I       Indicates the immediate two bytes following SS3
2834*       is a Chinese character as defined in CNS
2835*       11643-plane-3, until another SS3designation
2836*       appears
*       (Meaning <ESC>O must precede every 2 byte
*        sequence.)
2839*
2840*      ESC $ + J       Indicates the immediate two bytes following SS3
2841*       is a Chinese character as defined in CNS
2842*       11643-plane-4, until another SS3designation
2843*       appears
*       (In English: <ESC>O must precede every 2 byte
*        sequence.)
2846*
2847*      ESC $ + K       Indicates the immediate two bytes following SS3
2848*       is a Chinese character as defined in CNS
2849*       11643-plane-5, until another SS3designation
2850*       appears
2851*
2852*      ESC $ + L       Indicates the immediate two bytes following SS3
2853*       is a Chinese character as defined in CNS
2854*       11643-plane-6, until another SS3designation
2855*       appears
2856*
2857*      ESC $ + M       Indicates the immediate two bytes following SS3
2858*       is a Chinese character as defined in CNS
2859*       11643-plane-7, until another SS3designation
2860*       appears
2861*
2862*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2863*       has its own designation information before any Chinese characters
2864*       appear
2865*
2866*/
2867
2868/* The following are defined this way to make the strings truly readonly */
2869static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
2870static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
2871static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
2872static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
2873static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2874static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2875static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2876static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2877static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2878
/********************** ISO2022-CN Data **************************/
/*
 * Shift/designation strings for the ISO-2022-CN family, in array order.
 * The fromUnicode code in this file indexes entries below CNS_11643 directly
 * by charset value, and the CNS 11643 planes as
 * CNS_11643 + (cs - CNS_11643_1).
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
};
2892
2893static void
2894UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2895    UConverter *cnv = args->converter;
2896    UConverterDataISO2022 *converterData;
2897    ISO2022State *pFromU2022State;
2898    uint8_t *target = (uint8_t *) args->target;
2899    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2900    const UChar* source = args->source;
2901    const UChar* sourceLimit = args->sourceLimit;
2902    int32_t* offsets = args->offsets;
2903    UChar32 sourceChar;
2904    char buffer[8];
2905    int32_t len;
2906    int8_t choices[3];
2907    int32_t choiceCount;
2908    uint32_t targetValue = 0;
2909    UBool useFallback;
2910
2911    /* set up the state */
2912    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2913    pFromU2022State   = &converterData->fromU2022State;
2914
2915    choiceCount = 0;
2916
2917    /* check if the last codepoint of previous buffer was a lead surrogate*/
2918    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2919        goto getTrail;
2920    }
2921
2922    while( source < sourceLimit){
2923        if(target < targetLimit){
2924
2925            sourceChar  = *(source++);
2926            /*check if the char is a First surrogate*/
2927             if(U16_IS_SURROGATE(sourceChar)) {
2928                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2929getTrail:
2930                    /*look ahead to find the trail surrogate*/
2931                    if(source < sourceLimit) {
2932                        /* test the following code unit */
2933                        UChar trail=(UChar) *source;
2934                        if(U16_IS_TRAIL(trail)) {
2935                            source++;
2936                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2937                            cnv->fromUChar32=0x00;
2938                            /* convert this supplementary code point */
2939                            /* exit this condition tree */
2940                        } else {
2941                            /* this is an unmatched lead code unit (1st surrogate) */
2942                            /* callback(illegal) */
2943                            *err=U_ILLEGAL_CHAR_FOUND;
2944                            cnv->fromUChar32=sourceChar;
2945                            break;
2946                        }
2947                    } else {
2948                        /* no more input */
2949                        cnv->fromUChar32=sourceChar;
2950                        break;
2951                    }
2952                } else {
2953                    /* this is an unmatched trail code unit (2nd surrogate) */
2954                    /* callback(illegal) */
2955                    *err=U_ILLEGAL_CHAR_FOUND;
2956                    cnv->fromUChar32=sourceChar;
2957                    break;
2958                }
2959            }
2960
2961            /* do the conversion */
2962            if(sourceChar <= 0x007f ){
2963                /* do not convert SO/SI/ESC */
2964                if(IS_2022_CONTROL(sourceChar)) {
2965                    /* callback(illegal) */
2966                    *err=U_ILLEGAL_CHAR_FOUND;
2967                    cnv->fromUChar32=sourceChar;
2968                    break;
2969                }
2970
2971                /* US-ASCII */
2972                if(pFromU2022State->g == 0) {
2973                    buffer[0] = (char)sourceChar;
2974                    len = 1;
2975                } else {
2976                    buffer[0] = UCNV_SI;
2977                    buffer[1] = (char)sourceChar;
2978                    len = 2;
2979                    pFromU2022State->g = 0;
2980                    choiceCount = 0;
2981                }
2982                if(sourceChar == CR || sourceChar == LF) {
2983                    /* reset the state at the end of a line */
2984                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2985                    choiceCount = 0;
2986                }
2987            }
2988            else{
2989                /* convert U+0080..U+10ffff */
2990                int32_t i;
2991                int8_t cs, g;
2992
2993                if(choiceCount == 0) {
2994                    /* try the current SO/G1 converter first */
2995                    choices[0] = pFromU2022State->cs[1];
2996
2997                    /* default to GB2312_1 if none is designated yet */
2998                    if(choices[0] == 0) {
2999                        choices[0] = GB2312_1;
3000                    }
3001
3002                    if(converterData->version == 0) {
3003                        /* ISO-2022-CN */
3004
3005                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3006                        if(choices[0] == GB2312_1) {
3007                            choices[1] = (int8_t)CNS_11643_1;
3008                        } else {
3009                            choices[1] = (int8_t)GB2312_1;
3010                        }
3011
3012                        choiceCount = 2;
3013                    } else if (converterData->version == 1) {
3014                        /* ISO-2022-CN-EXT */
3015
3016                        /* try one of the other converters */
3017                        switch(choices[0]) {
3018                        case GB2312_1:
3019                            choices[1] = (int8_t)CNS_11643_1;
3020                            choices[2] = (int8_t)ISO_IR_165;
3021                            break;
3022                        case ISO_IR_165:
3023                            choices[1] = (int8_t)GB2312_1;
3024                            choices[2] = (int8_t)CNS_11643_1;
3025                            break;
3026                        default: /* CNS_11643_x */
3027                            choices[1] = (int8_t)GB2312_1;
3028                            choices[2] = (int8_t)ISO_IR_165;
3029                            break;
3030                        }
3031
3032                        choiceCount = 3;
3033                    } else {
3034                        choices[0] = (int8_t)CNS_11643_1;
3035                        choices[1] = (int8_t)GB2312_1;
3036                    }
3037                }
3038
3039                cs = g = 0;
3040                /*
3041                 * len==0: no mapping found yet
3042                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3043                 * len>0: found a roundtrip result, done
3044                 */
3045                len = 0;
3046                /*
3047                 * We will turn off useFallback after finding a fallback,
3048                 * but we still get fallbacks from PUA code points as usual.
3049                 * Therefore, we will also need to check that we don't overwrite
3050                 * an early fallback with a later one.
3051                 */
3052                useFallback = cnv->useFallback;
3053
3054                for(i = 0; i < choiceCount && len <= 0; ++i) {
3055                    int8_t cs0 = choices[i];
3056                    if(cs0 > 0) {
3057                        uint32_t value;
3058                        int32_t len2;
3059                        if(cs0 >= CNS_11643_0) {
3060                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3061                                        converterData->myConverterArray[CNS_11643],
3062                                        sourceChar,
3063                                        &value,
3064                                        useFallback,
3065                                        MBCS_OUTPUT_3);
3066                            if(len2 == 3 || (len2 == -3 && len == 0)) {
3067                                targetValue = value;
3068                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3069                                if(len2 >= 0) {
3070                                    len = 2;
3071                                } else {
3072                                    len = -2;
3073                                    useFallback = FALSE;
3074                                }
3075                                if(cs == CNS_11643_1) {
3076                                    g = 1;
3077                                } else if(cs == CNS_11643_2) {
3078                                    g = 2;
3079                                } else /* plane 3..7 */ if(converterData->version == 1) {
3080                                    g = 3;
3081                                } else {
3082                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3083                                    len = 0;
3084                                }
3085                            }
3086                        } else {
3087                            /* GB2312_1 or ISO-IR-165 */
3088                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3089                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3090                                        converterData->myConverterArray[cs0],
3091                                        sourceChar,
3092                                        &value,
3093                                        useFallback,
3094                                        MBCS_OUTPUT_2);
3095                            if(len2 == 2 || (len2 == -2 && len == 0)) {
3096                                targetValue = value;
3097                                len = len2;
3098                                cs = cs0;
3099                                g = 1;
3100                                useFallback = FALSE;
3101                            }
3102                        }
3103                    }
3104                }
3105
3106                if(len != 0) {
3107                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
3108
3109                    /* write the designation sequence if necessary */
3110                    if(cs != pFromU2022State->cs[g]) {
3111                        if(cs < CNS_11643) {
3112                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3113                        } else {
3114                            U_ASSERT(cs >= CNS_11643_1);
3115                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3116                        }
3117                        len = 4;
3118                        pFromU2022State->cs[g] = cs;
3119                        if(g == 1) {
3120                            /* changing the SO/G1 charset invalidates the choices[] */
3121                            choiceCount = 0;
3122                        }
3123                    }
3124
3125                    /* write the shift sequence if necessary */
3126                    if(g != pFromU2022State->g) {
3127                        switch(g) {
3128                        case 1:
3129                            buffer[len++] = UCNV_SO;
3130
3131                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3132                            pFromU2022State->g = 1;
3133                            break;
3134                        case 2:
3135                            buffer[len++] = 0x1b;
3136                            buffer[len++] = 0x4e;
3137                            break;
3138                        default: /* case 3 */
3139                            buffer[len++] = 0x1b;
3140                            buffer[len++] = 0x4f;
3141                            break;
3142                        }
3143                    }
3144
3145                    /* write the two output bytes */
3146                    buffer[len++] = (char)(targetValue >> 8);
3147                    buffer[len++] = (char)targetValue;
3148                } else {
3149                    /* if we cannot find the character after checking all codepages
3150                     * then this is an error
3151                     */
3152                    *err = U_INVALID_CHAR_FOUND;
3153                    cnv->fromUChar32=sourceChar;
3154                    break;
3155                }
3156            }
3157
3158            /* output len>0 bytes in buffer[] */
3159            if(len == 1) {
3160                *target++ = buffer[0];
3161                if(offsets) {
3162                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3163                }
3164            } else if(len == 2 && (target + 2) <= targetLimit) {
3165                *target++ = buffer[0];
3166                *target++ = buffer[1];
3167                if(offsets) {
3168                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3169                    *offsets++ = sourceIndex;
3170                    *offsets++ = sourceIndex;
3171                }
3172            } else {
3173                fromUWriteUInt8(
3174                    cnv,
3175                    buffer, len,
3176                    &target, (const char *)targetLimit,
3177                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3178                    err);
3179                if(U_FAILURE(*err)) {
3180                    break;
3181                }
3182            }
3183        } /* end if(myTargetIndex<myTargetLength) */
3184        else{
3185            *err =U_BUFFER_OVERFLOW_ERROR;
3186            break;
3187        }
3188
3189    }/* end while(mySourceIndex<mySourceLength) */
3190
3191    /*
3192     * the end of the input stream and detection of truncated input
3193     * are handled by the framework, but for ISO-2022-CN conversion
3194     * we need to be in ASCII mode at the very end
3195     *
3196     * conditions:
3197     *   successful
3198     *   not in ASCII mode
3199     *   end of input and no truncated input
3200     */
3201    if( U_SUCCESS(*err) &&
3202        pFromU2022State->g!=0 &&
3203        args->flush && source>=sourceLimit && cnv->fromUChar32==0
3204    ) {
3205        int32_t sourceIndex;
3206
3207        /* we are switching to ASCII */
3208        pFromU2022State->g=0;
3209
3210        /* get the source index of the last input character */
3211        /*
3212         * TODO this would be simpler and more reliable if we used a pair
3213         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3214         * so that we could simply use the prevSourceIndex here;
3215         * this code gives an incorrect result for the rare case of an unmatched
3216         * trail surrogate that is alone in the last buffer of the text stream
3217         */
3218        sourceIndex=(int32_t)(source-args->source);
3219        if(sourceIndex>0) {
3220            --sourceIndex;
3221            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3222                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3223            ) {
3224                --sourceIndex;
3225            }
3226        } else {
3227            sourceIndex=-1;
3228        }
3229
3230        fromUWriteUInt8(
3231            cnv,
3232            SHIFT_IN_STR, 1,
3233            &target, (const char *)targetLimit,
3234            &offsets, sourceIndex,
3235            err);
3236    }
3237
3238    /*save the state and return */
3239    args->source = source;
3240    args->target = (char*)target;
3241}
3242
3243
/*
 * Converts ISO-2022-CN family bytes to UTF-16 and, when args->offsets is not
 * NULL, records the source index for every UChar written.
 *
 * args: in/out conversion arguments; args->source and args->target are
 *       updated before every return.  args->converter->extraInfo must point
 *       to a UConverterDataISO2022.
 * err:  receives U_ILLEGAL_ESCAPE_SEQUENCE for an empty SO...SI/designator
 *       segment, U_BUFFER_OVERFLOW_ERROR when the target is full, and
 *       whatever changeState_2022() or toUnicodeCallback() set.
 *
 * State resumed from a previous call:
 *   - myData->key != 0: a partial escape sequence is pending,
 *   - converter->toULength == 1: the lead byte of a double-byte character
 *     is pending in converter->toUBytes[0].
 */
static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                /* shift in: back to G0 */
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                /* shift out: only valid once something has been designated to G1 */
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                    break;
                }

            case ESC_2022:
                /* put the ESC back so that changeState_2022() sees the whole sequence */
                mySource--;
escape:
                {
                    /* remember where we started so the length of an irregular sequence can be reported */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* end of line: forget all designations and shifts */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS 11643: prefix the pair with a plane byte (0x80 + plane) for the 3-byte lookup */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            /* keep both bytes so that offsets and error reporting see a 2-byte character */
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* input ends after the lead byte: save it for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* G0: only 7-bit values are passed through; others stay missingCharMarker */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* save the positions for the caller */
    args->target = myTarget;
    args->source = mySource;
}
3445
3446static void
3447_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3448    UConverter *cnv = args->converter;
3449    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3450    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3451    char *p, *subchar;
3452    char buffer[8];
3453    int32_t length;
3454
3455    subchar=(char *)cnv->subChars;
3456    length=cnv->subCharLen; /* assume length==1 for most variants */
3457
3458    p = buffer;
3459    switch(myConverterData->locale[0]){
3460    case 'j':
3461        {
3462            int8_t cs;
3463
3464            if(pFromU2022State->g == 1) {
3465                /* JIS7: switch from G1 to G0 */
3466                pFromU2022State->g = 0;
3467                *p++ = UCNV_SI;
3468            }
3469
3470            cs = pFromU2022State->cs[0];
3471            if(cs != ASCII && cs != JISX201) {
3472                /* not in ASCII or JIS X 0201: switch to ASCII */
3473                pFromU2022State->cs[0] = (int8_t)ASCII;
3474                *p++ = '\x1b';
3475                *p++ = '\x28';
3476                *p++ = '\x42';
3477            }
3478
3479            *p++ = subchar[0];
3480            break;
3481        }
3482    case 'c':
3483        if(pFromU2022State->g != 0) {
3484            /* not in ASCII mode: switch to ASCII */
3485            pFromU2022State->g = 0;
3486            *p++ = UCNV_SI;
3487        }
3488        *p++ = subchar[0];
3489        break;
3490    case 'k':
3491        if(myConverterData->version == 0) {
3492            if(length == 1) {
3493                if((UBool)args->converter->fromUnicodeStatus) {
3494                    /* in DBCS mode: switch to SBCS */
3495                    args->converter->fromUnicodeStatus = 0;
3496                    *p++ = UCNV_SI;
3497                }
3498                *p++ = subchar[0];
3499            } else /* length == 2*/ {
3500                if(!(UBool)args->converter->fromUnicodeStatus) {
3501                    /* in SBCS mode: switch to DBCS */
3502                    args->converter->fromUnicodeStatus = 1;
3503                    *p++ = UCNV_SO;
3504                }
3505                *p++ = subchar[0];
3506                *p++ = subchar[1];
3507            }
3508            break;
3509        } else {
3510            /* save the subconverter's substitution string */
3511            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3512            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3513
3514            /* set our substitution string into the subconverter */
3515            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3516            myConverterData->currentConverter->subCharLen = (int8_t)length;
3517
3518            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3519            args->converter = myConverterData->currentConverter;
3520            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3521            ucnv_cbFromUWriteSub(args, 0, err);
3522            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3523            args->converter = cnv;
3524
3525            /* restore the subconverter's substitution string */
3526            myConverterData->currentConverter->subChars = currentSubChars;
3527            myConverterData->currentConverter->subCharLen = currentSubCharLen;
3528
3529            if(*err == U_BUFFER_OVERFLOW_ERROR) {
3530                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3531                    uprv_memcpy(
3532                        cnv->charErrorBuffer,
3533                        myConverterData->currentConverter->charErrorBuffer,
3534                        myConverterData->currentConverter->charErrorBufferLength);
3535                }
3536                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3537                myConverterData->currentConverter->charErrorBufferLength = 0;
3538            }
3539            return;
3540        }
3541    default:
3542        /* not expected */
3543        break;
3544    }
3545    ucnv_cbFromUWriteBytes(args,
3546                           buffer, (int32_t)(p - buffer),
3547                           offsetIndex, err);
3548}
3549
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 */
struct cloneStruct
{
    UConverter cnv;                 /* the cloned outer converter; its address is what _ISO_2022_SafeClone() returns */
    UConverter currentConverter;    /* buffer handed to ucnv_safeClone() for the sub-converter clone */
    UAlignedMemory deadSpace;       /* padding counted in the size passed for the sub-converter clone */
    UConverterDataISO2022 mydata;   /* copy of the source converter's extraInfo */
};
3568
3569
3570static UConverter *
3571_ISO_2022_SafeClone(
3572            const UConverter *cnv,
3573            void *stackBuffer,
3574            int32_t *pBufferSize,
3575            UErrorCode *status)
3576{
3577    struct cloneStruct * localClone;
3578    UConverterDataISO2022 *cnvData;
3579    int32_t i, size;
3580
3581    if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3582        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3583        return NULL;
3584    }
3585
3586    cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3587    localClone = (struct cloneStruct *)stackBuffer;
3588
3589    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3590
3591    uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3592    localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3593    localClone->cnv.isExtraLocal = TRUE;
3594
3595    /* share the subconverters */
3596
3597    if(cnvData->currentConverter != NULL) {
3598        size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3599        localClone->mydata.currentConverter =
3600            ucnv_safeClone(cnvData->currentConverter,
3601                            &localClone->currentConverter,
3602                            &size, status);
3603        if(U_FAILURE(*status)) {
3604            return NULL;
3605        }
3606    }
3607
3608    for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3609        if(cnvData->myConverterArray[i] != NULL) {
3610            ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3611        }
3612    }
3613
3614    return &localClone->cnv;
3615}
3616
3617static void
3618_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3619                    const USetAdder *sa,
3620                    UConverterUnicodeSet which,
3621                    UErrorCode *pErrorCode)
3622{
3623    int32_t i;
3624    UConverterDataISO2022* cnvData;
3625
3626    if (U_FAILURE(*pErrorCode)) {
3627        return;
3628    }
3629#ifdef U_ENABLE_GENERIC_ISO_2022
3630    if (cnv->sharedData == &_ISO2022Data) {
3631        /* We use UTF-8 in this case */
3632        sa->addRange(sa->set, 0, 0xd7FF);
3633        sa->addRange(sa->set, 0xE000, 0x10FFFF);
3634        return;
3635    }
3636#endif
3637
3638    cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3639
3640    /* open a set and initialize it with code points that are algorithmically round-tripped */
3641    switch(cnvData->locale[0]){
3642    case 'j':
3643        /* include JIS X 0201 which is hardcoded */
3644        sa->add(sa->set, 0xa5);
3645        sa->add(sa->set, 0x203e);
3646        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3647            /* include Latin-1 for some variants of JP */
3648            sa->addRange(sa->set, 0, 0xff);
3649        } else {
3650            /* include ASCII for JP */
3651            sa->addRange(sa->set, 0, 0x7f);
3652        }
3653        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3654            /*
3655             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3656             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3657             * use half-width Katakana.
3658             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3659             * half-width Katakana via the ESC ( I sequence.
3660             * However, we only emit (fromUnicode) half-width Katakana according to the
3661             * definition of each variant.
3662             *
3663             * When including fallbacks,
3664             * we need to include half-width Katakana Unicode code points for all JP variants because
3665             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3666             */
3667            /* include half-width Katakana for JP */
3668            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3669        }
3670        break;
3671    case 'c':
3672    case 'z':
3673        /* include ASCII for CN */
3674        sa->addRange(sa->set, 0, 0x7f);
3675        break;
3676    case 'k':
3677        /* there is only one converter for KR, and it is not in the myConverterArray[] */
3678        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3679                cnvData->currentConverter, sa, which, pErrorCode);
3680        /* the loop over myConverterArray[] will simply not find another converter */
3681        break;
3682    default:
3683        break;
3684    }
3685
3686#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3687            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3688                cnvData->version==0 && i==CNS_11643
3689            ) {
3690                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3691                ucnv_MBCSGetUnicodeSetForBytes(
3692                        cnvData->myConverterArray[i],
3693                        sa, UCNV_ROUNDTRIP_SET,
3694                        0, 0x81, 0x82,
3695                        pErrorCode);
3696            }
3697#endif
3698
3699    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3700        UConverterSetFilter filter;
3701        if(cnvData->myConverterArray[i]!=NULL) {
3702            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3703                cnvData->version==0 && i==CNS_11643
3704            ) {
3705                /*
3706                 * Version-specific for CN:
3707                 * CN version 0 does not map CNS planes 3..7 although
3708                 * they are all available in the CNS conversion table;
3709                 * CN version 1 (-EXT) does map them all.
3710                 * The two versions create different Unicode sets.
3711                 */
3712                filter=UCNV_SET_FILTER_2022_CN;
3713            } else if(cnvData->locale[0]=='j' && i==JISX208) {
3714                /*
3715                 * Only add code points that map to Shift-JIS codes
3716                 * corresponding to JIS X 0208.
3717                 */
3718                filter=UCNV_SET_FILTER_SJIS;
3719            } else if(i==KSC5601) {
3720                /*
3721                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3722                 * are broader than GR94.
3723                 */
3724                filter=UCNV_SET_FILTER_GR94DBCS;
3725            } else {
3726                filter=UCNV_SET_FILTER_NONE;
3727            }
3728            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3729        }
3730    }
3731
3732    /*
3733     * ISO 2022 converters must not convert SO/SI/ESC despite what
3734     * sub-converters do by themselves.
3735     * Remove these characters from the set.
3736     */
3737    sa->remove(sa->set, 0x0e);
3738    sa->remove(sa->set, 0x0f);
3739    sa->remove(sa->set, 0x1b);
3740
3741    /* ISO 2022 converters do not convert C1 controls either */
3742    sa->removeRange(sa->set, 0x80, 0x9f);
3743}
3744
/*
 * Implementation table for the generic ISO-2022 converter.
 * The four conversion-function slots are filled in only when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 * NOTE(review): the slot labels below assume the field order of
 * UConverterImpl, whose declaration is not visible here - confirm.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022, /* type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

#ifdef U_ENABLE_GENERIC_ISO_2022
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL, /* getNextUChar */

    NULL, /* getStarters */
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL, /* toUTF8 */
    NULL  /* fromUTF8 */
};
/*
 * Static description of the generic "ISO_2022" converter.
 * NOTE(review): the field labels below assume the field order of
 * UConverterStaticData, whose declaration is not visible here - confirm.
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData), /* structSize */
    "ISO_2022", /* name */
    2022, /* codepage */
    UCNV_IBM, /* platform */
    UCNV_ISO_2022, /* conversionType */
    1, /* minBytesPerChar */
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 }, /* subChar */
    1, /* subCharLen */
    FALSE, /* hasToUnicodeFallback */
    FALSE, /* hasFromUnicodeFallback */
    0, /* unicodeMask */
    0, /* subChar1 */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data for the generic ISO-2022 converter; ties together
 * _ISO2022StaticData and _ISO2022Impl. Unlike the JP/KR/CN shared data
 * objects, this one is not declared inside an anonymous namespace.
 * NOTE(review): the field labels below assume the field order of
 * UConverterSharedData, whose declaration is not visible here - confirm.
 */
const UConverterSharedData _ISO2022Data={
    sizeof(UConverterSharedData), /* structSize */
    ~((uint32_t) 0), /* referenceCounter: all bits set */
    NULL, /* dataMemory */
    NULL, /* table */
    &_ISO2022StaticData,
    FALSE, /* sharedDataCached */
    &_ISO2022Impl,
    0, UCNV_MBCS_TABLE_INITIALIZER /* toUnicodeStatus, mbcs */
};
3803
3804/*************JP****************/
/*
 * Implementation table for ISO-2022-JP.
 * It shares open/close/reset, getName, writeSub, safeClone and
 * getUnicodeSet with the other ISO 2022 tables in this file; only the
 * conversion functions are JP-specific. Each conversion function is
 * listed in two adjacent slots.
 * NOTE(review): the slot labels below assume the field order of
 * UConverterImpl, whose declaration is not visible here - confirm.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022, /* type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* toUnicode */
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicode */
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
    NULL, /* getNextUChar */

    NULL, /* getStarters */
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL, /* toUTF8 */
    NULL  /* fromUTF8 */
};
/*
 * Static description of the "ISO_2022_JP" converter.
 * NOTE(review): the field labels below assume the field order of
 * UConverterStaticData, whose declaration is not visible here - confirm.
 */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData), /* structSize */
    "ISO_2022_JP", /* name */
    0, /* codepage */
    UCNV_IBM, /* platform */
    UCNV_ISO_2022, /* conversionType */
    1, /* minBytesPerChar */
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 }, /* subChar */
    1, /* subCharLen */
    FALSE, /* hasToUnicodeFallback */
    FALSE, /* hasFromUnicodeFallback */
    0, /* unicodeMask */
    0, /* subChar1 */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3846
namespace {

/*
 * Shared data for ISO-2022-JP; ties together _ISO2022JPStaticData and
 * _ISO2022JPImpl. Declared in an anonymous namespace, so it has internal
 * linkage.
 * NOTE(review): the field labels below assume the field order of
 * UConverterSharedData, whose declaration is not visible here - confirm.
 */
const UConverterSharedData _ISO2022JPData={
    sizeof(UConverterSharedData), /* structSize */
    ~((uint32_t) 0), /* referenceCounter: all bits set */
    NULL, /* dataMemory */
    NULL, /* table */
    &_ISO2022JPStaticData,
    FALSE, /* sharedDataCached */
    &_ISO2022JPImpl,
    0, UCNV_MBCS_TABLE_INITIALIZER /* toUnicodeStatus, mbcs */
};

}  // namespace
3861
3862/************* KR ***************/
/*
 * Implementation table for ISO-2022-KR.
 * It shares open/close/reset, getName, writeSub, safeClone and
 * getUnicodeSet with the other ISO 2022 tables in this file; only the
 * conversion functions are KR-specific. Each conversion function is
 * listed in two adjacent slots.
 * NOTE(review): the slot labels below assume the field order of
 * UConverterImpl, whose declaration is not visible here - confirm.
 */
static const UConverterImpl _ISO2022KRImpl={
    UCNV_ISO_2022, /* type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicode */
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicode */
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
    NULL, /* getNextUChar */

    NULL, /* getStarters */
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL, /* toUTF8 */
    NULL  /* fromUTF8 */
};
/*
 * Static description of the "ISO_2022_KR" converter.
 * NOTE(review): the field labels below assume the field order of
 * UConverterStaticData, whose declaration is not visible here - confirm.
 */
static const UConverterStaticData _ISO2022KRStaticData={
    sizeof(UConverterStaticData), /* structSize */
    "ISO_2022_KR", /* name */
    0, /* codepage */
    UCNV_IBM, /* platform */
    UCNV_ISO_2022, /* conversionType */
    1, /* minBytesPerChar */
    3, /* max 3 bytes per UChar: SO+DBCS */
    { 0x1a, 0, 0, 0 }, /* subChar */
    1, /* subCharLen */
    FALSE, /* hasToUnicodeFallback */
    FALSE, /* hasFromUnicodeFallback */
    0, /* unicodeMask */
    0, /* subChar1 */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3904
namespace {

/*
 * Shared data for ISO-2022-KR; ties together _ISO2022KRStaticData and
 * _ISO2022KRImpl. Declared in an anonymous namespace, so it has internal
 * linkage.
 * NOTE(review): the field labels below assume the field order of
 * UConverterSharedData, whose declaration is not visible here - confirm.
 */
const UConverterSharedData _ISO2022KRData={
    sizeof(UConverterSharedData), /* structSize */
    ~((uint32_t) 0), /* referenceCounter: all bits set */
    NULL, /* dataMemory */
    NULL, /* table */
    &_ISO2022KRStaticData,
    FALSE, /* sharedDataCached */
    &_ISO2022KRImpl,
    0, UCNV_MBCS_TABLE_INITIALIZER /* toUnicodeStatus, mbcs */
};

}  // namespace
3919
3920/*************** CN ***************/
/*
 * Implementation table for ISO-2022-CN.
 * It shares open/close/reset, getName, writeSub, safeClone and
 * getUnicodeSet with the other ISO 2022 tables in this file; only the
 * conversion functions are CN-specific. Each conversion function is
 * listed in two adjacent slots.
 * NOTE(review): the slot labels below assume the field order of
 * UConverterImpl, whose declaration is not visible here - confirm.
 */
static const UConverterImpl _ISO2022CNImpl={

    UCNV_ISO_2022, /* type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicode */
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicode */
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
    NULL, /* getNextUChar */

    NULL, /* getStarters */
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL, /* toUTF8 */
    NULL  /* fromUTF8 */
};
/*
 * Static description of the "ISO_2022_CN" converter.
 * NOTE(review): the field labels below assume the field order of
 * UConverterStaticData, whose declaration is not visible here - confirm.
 */
static const UConverterStaticData _ISO2022CNStaticData={
    sizeof(UConverterStaticData), /* structSize */
    "ISO_2022_CN", /* name */
    0, /* codepage */
    UCNV_IBM, /* platform */
    UCNV_ISO_2022, /* conversionType */
    1, /* minBytesPerChar */
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
    { 0x1a, 0, 0, 0 }, /* subChar */
    1, /* subCharLen */
    FALSE, /* hasToUnicodeFallback */
    FALSE, /* hasFromUnicodeFallback */
    0, /* unicodeMask */
    0, /* subChar1 */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3963
namespace {

/*
 * Shared data for ISO-2022-CN; ties together _ISO2022CNStaticData and
 * _ISO2022CNImpl. Declared in an anonymous namespace, so it has internal
 * linkage.
 * NOTE(review): the field labels below assume the field order of
 * UConverterSharedData, whose declaration is not visible here - confirm.
 */
const UConverterSharedData _ISO2022CNData={
    sizeof(UConverterSharedData), /* structSize */
    ~((uint32_t) 0), /* referenceCounter: all bits set */
    NULL, /* dataMemory */
    NULL, /* table */
    &_ISO2022CNStaticData,
    FALSE, /* sharedDataCached */
    &_ISO2022CNImpl,
    0, UCNV_MBCS_TABLE_INITIALIZER /* toUnicodeStatus, mbcs */
};

}  // namespace
3978
3979#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3980