1/*
2**********************************************************************
3*   Copyright (C) 2000-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv2022.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2000feb03
12*   created by: Markus W. Scherer
13*
14*   Change history:
15*
16*   06/29/2000  helena  Major rewrite of the callback APIs.
17*   08/08/2000  Ram     Included support for ISO-2022-JP-2
18*                       Changed implementation of toUnicode
19*                       function
20*   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
22*                       ucnvebdc.c
23*   09/20/2000  Ram     Added support for ISO-2022-CN
24*                       Added implementations for getNextUChar()
25*                       for specific 2022 country variants.
26*   10/31/2000  Ram     Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
37#include "unicode/utf16.h"
38#include "ucnv_imp.h"
39#include "ucnv_bld.h"
40#include "ucnv_cnv.h"
41#include "ucnvmbcs.h"
42#include "cstring.h"
43#include "cmemory.h"
44#include "uassert.h"
45
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * primary expression next to any operator (sizeof, postfix operators, ...).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
47
48#ifdef U_ENABLE_GENERIC_ISO_2022
49/*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 *    its designation sequences, single shifts with return to the previous state,
56 *    switch-with-no-return to UTF-16BE or similar, etc.
57 *    This is unlike the language-specific variants like ISO-2022-JP which
58 *    require a much smaller repertoire of ISO-2022 features.
59 *    These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 *    but rather always one of the language-specific variants.
62 *    Note that ICU's generic ISO-2022 converter has always output one escape
63 *    sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 *    the previous converter is closed and a new one opened,
66 *    without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 *    reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 *    This means, for example, that when ISO-8859-7 is designated, the following
71 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 *    The ICU ISO-2022 converter does not handle this - and has no information
73 *    about which subconverter would have to be shifted vs. which is designed
74 *    for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78#endif
79
/* The Shift-In control byte (0x0F) as a NUL-terminated byte string. */
static const char SHIFT_IN_STR[] = "\x0F";
/* A matching SHIFT_OUT_STR ("\x0E") is currently not defined. */

/* C0 control characters and the space character, by code value. */
#define H_TAB   0x09
#define LF      0x0A
#define V_TAB   0x0B
#define CR      0x0D
#define SPACE   0x20

/* Inclusive code point range U+FF61..U+FF9F covered by the HWKANA_* bounds. */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
93
/*
 * ISO 2022 carries graphic characters as 7-bit bytes, so the native
 * byte values of a subcharset are lowered by 0x80:
 * - 94-character sets: native bytes A1..FE are encoded as 21..7E.
 * - 96-character sets: native bytes A0..FF are encoded as 20..7F.
 * C1 control codes (native bytes 80..9F) must not be encoded this way,
 * because they would become bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START = 0xa1,
    GR94_END   = 0xfe,
    GR96_START = 0xa0,
    GR96_END   = 0xff
};
108
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * c is expected to be a code unit or code point of at most 32 bits.
 * The range test is done on the unsigned value so that a negative c
 * yields false instead of reaching the shift with a negative count.
 * Note: c is evaluated twice; do not pass expressions with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
116
/*
 * Charset/state numbers for the ISO-2022-JP and -CN implementations.
 * The JP and CN groups deliberately reuse the same small values.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE,                  /* 0x11 */

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,          /* Halfwidth Katakana 7 bit */

    /* CN */
    /* these first constants must keep their values: they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The following are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1,                /* 0x21 */
    CNS_11643_2,                /* 0x22 */
    CNS_11643_3,                /* 0x23 */
    CNS_11643_4,                /* 0x24 */
    CNS_11643_5,                /* 0x25 */
    CNS_11643_6,                /* 0x26 */
    CNS_11643_7                 /* 0x27 */
} StateEnum;
155
/* Does the StateEnum charset value cs lie in the JP double-byte range JISX208..KSC5601? */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)

/* Charset mask: a single bit at position cs. */
#define CSM(cs) ((uint16_t)1<<(cs))
160
161/*
162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
164 *
165 * Note: The converter uses some leniency:
166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
167 *   all versions, not just JIS7 and JIS8.
168 * - ICU does not distinguish between different versions of JIS X 0208.
169 */
170#if UCONFIG_NO_NON_HTML5_CONVERSION
171enum { MAX_JA_VERSION=0 };
172#else
173enum { MAX_JA_VERSION=4 };
174#endif
175static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
176    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
177#if !UCONFIG_NO_NON_HTML5_CONVERSION
178    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
179    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
180    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
181    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
182#endif
183};
184
/* Kind of converter recorded in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1,     /* 1 */
    SBCS,       /* 2 */
    DBCS,       /* 3 */
    MBCS,       /* 4 */
    HWKANA      /* 5 */
} Cnv2022Type;
193
/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
199
200#define UCNV_OPTIONS_VERSION_MASK 0xf
201#define UCNV_2022_MAX_CONVERTERS 10
202
203typedef struct{
204    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
205    UConverter *currentConverter;
206    Cnv2022Type currentType;
207    ISO2022State toU2022State, fromU2022State;
208    uint32_t key;
209    uint32_t version;
210#ifdef U_ENABLE_GENERIC_ISO_2022
211    UBool isFirstBuffer;
212#endif
213    UBool isEmptySegment;
214    char name[30];
215    char locale[3];
216}UConverterDataISO2022;
217
218/* Protos */
219/* ISO-2022 ----------------------------------------------------------------- */
220
221/*Forward declaration */
222U_CFUNC void
223ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
224                      UErrorCode * err);
225U_CFUNC void
226ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
227                                    UErrorCode * err);
228
229#define ESC_2022 0x1B /*ESC*/
230
/* States stored in escSeqStateTable_Value_2022[] for (partial) escape sequences. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one ISO 2022 escape sequence, but adding more characters might match another one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
238
239/*
240* The way these state transition arrays work is:
241* ex : ESC$B is the sequence for JISX208
242*      a) First Iteration: char is ESC
243*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
244*             int x = normalize_esq_chars_2022[27] which is equal to 1
245*         ii) Search for this value in escSeqStateTable_Key_2022[]
246*             value of x is stored at escSeqStateTable_Key_2022[0]
247*        iii) Save this index as offset
248*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
249*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
250*     b) Switch on this state and continue to next char
251*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
252*             which is normalize_esq_chars_2022[36] == 4
253*         ii) x is currently 1(from above)
254*               x<<=5 -- x is now 32
255*               x+=normalize_esq_chars_2022[36]
256*               now x is 36
257*        iii) Search for this value in escSeqStateTable_Key_2022[]
258*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
259*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
260*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
261*     c) Switch on this state and continue to next char
262*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
263*        ii) x is currently 36 (from above)
264*            x<<=5 -- x is now 1152
265*            x+=normalize_esq_chars_2022[66]
266*            now x is 1161
267*       iii) Search for this value in escSeqStateTable_Key_2022[]
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
272*/
273
274
/* Below are the 3 arrays depicting a state transition table. */

/*
 * Maps a byte value to the small code (1..29) that is folded into the
 * escape sequence key; 0 for bytes that do not occur in a recognized sequence.
 * Laid out as 16 entries per row: row label = high nibble, column = low nibble.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xA  xB  xC  xD  xE  xF */
/* 0x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 1x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
/* 2x */     0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
/* 3x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 4x */     5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
/* 5x */     0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
/* 6x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 7x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 8x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 9x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ax */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Bx */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Cx */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Dx */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ex */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Fx */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
306
307#ifdef U_ENABLE_GENERIC_ISO_2022
308/*
309 * When the generic ISO-2022 converter is completely removed, not just disabled
310 * per #ifdef, then the following state table and the associated tables that are
311 * dimensioned with MAX_STATES_2022 should be trimmed.
312 *
313 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
314 * the associated escape sequences starting with ESC ( B should be removed.
315 * This includes the ones with key values 1097 and all of the ones above 1000000.
316 *
317 * For the latter, the tables can simply be truncated.
318 * For the former, since the tables must be kept parallel, it is probably best
319 * to simply duplicate an adjacent table cell, parallel in all tables.
320 *
321 * It may make sense to restructure the tables, especially by using small search
322 * tables for the variants instead of indexing them parallel to the table here.
323 */
324#endif
325
/* Number of entries in each of the parallel escSeqStateTable_*_2022[] tables. */
#define MAX_STATES_2022 74

/*
 * Keys of the recognized (partial) escape sequences, in ascending order.
 * An entry's index is the offset used with the other escSeqStateTable_*_2022[] tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1, 34, 36, 39, 55, 57, 60, 61, 1093, 1096,
    /* 10 */ 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
    /* 20 */ 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257,
    /* 30 */ 1768, 1773, 1957, 35105, 36933, 36936, 36937, 36938, 36939, 36940,
    /* 40 */ 36942, 36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644,
    /* 50 */ 37646, 37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138,
    /* 60 */ 40139, 40140, 40141, 1123363, 35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
339
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter names for the generic ISO-2022 converter, parallel to
 * escSeqStateTable_Key_2022[]; NULL where an entry has no converter name.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL, NULL, NULL, NULL, NULL,
    /*  5 */ NULL, NULL, NULL, "latin1", "latin1",
    /* 10 */ "latin1", "ibm-865", "ibm-865", "ibm-865", "ibm-865",
    /* 15 */ "ibm-865", "ibm-865", "JISX0201", "JISX0201", "latin1",
    /* 20 */ "latin1", NULL, "JISX-208", "ibm-5478", "JISX-208",
    /* 25 */ NULL, NULL, NULL, NULL, "UTF8",
    /* 30 */ "ISO-8859-1", "ISO-8859-7", "JIS-X-208", NULL, "ibm-955",
    /* 35 */ "ibm-367", "ibm-952", "ibm-949", "JISX-212", "ibm-1383",
    /* 40 */ "ibm-952", "ibm-964", "ibm-964", "ibm-964", "ibm-964",
    /* 45 */ "ibm-964", "ibm-964", "ibm-5478", "ibm-949", "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1", "CNS-11643-1992,2", "CNS-11643-1992,3", "CNS-11643-1992,4", "CNS-11643-1992,5",
    /* 55 */ "CNS-11643-1992,6", "CNS-11643-1992,7", "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL, "latin1",
    /* 65 */ "ibm-912", "ibm-913", "ibm-914", "ibm-813", "ibm-1089",
    /* 70 */ "ibm-920", "ibm-915", "ibm-915", "latin1"
};

#endif
356
357static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
358/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
359     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
360    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
361    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
362    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
363    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
364    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
365    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
366    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
367};
368
369
/* Enable ISO-2022-{KR,CN,CN-Ext} for now.
 * TODO(jshin): Disable it when we know what to do about 'replacement'
 * encodings. See http://crbug.com/277037 and
 * https://codereview.chromium.org/145973021/
 */
#ifndef U_ENABLE_ISO_2022_KR_CN
#   define U_ENABLE_ISO_2022_KR_CN 1
#endif

/* Type def for refactoring changeState_2022 code: identifies the ISO-2022 variant. */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
#ifdef U_ENABLE_ISO_2022_KR_CN
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
390
391/*********** ISO 2022 Converter Protos ***********/
392static void
393_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
394
395static void
396 _ISO2022Close(UConverter *converter);
397
398static void
399_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
400
401static const char*
402_ISO2022getName(const UConverter* cnv);
403
404static void
405_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
406
407static UConverter *
408_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
409
410#ifdef U_ENABLE_GENERIC_ISO_2022
411static void
412T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
413#endif
414
415namespace {
416
417/*const UConverterSharedData _ISO2022Data;*/
418extern const UConverterSharedData _ISO2022JPData;
419extern const UConverterSharedData _ISO2022KRData;
420extern const UConverterSharedData _ISO2022CNData;
421
422}  // namespace
423
424/*************** Converter implementations ******************/
425
426/* The purpose of this function is to get around gcc compiler warnings. */
427static inline void
428fromUWriteUInt8(UConverter *cnv,
429                 const char *bytes, int32_t length,
430                 uint8_t **target, const char *targetLimit,
431                 int32_t **offsets,
432                 int32_t sourceIndex,
433                 UErrorCode *pErrorCode)
434{
435    char *targetChars = (char *)*target;
436    ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
437                         offsets, sourceIndex, pErrorCode);
438    *target = (uint8_t*)targetChars;
439
440}
441
442static inline void
443setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
444    if(myConverterData->version == 1) {
445        UConverter *cnv = myConverterData->currentConverter;
446
447        cnv->toUnicodeStatus=0;     /* offset */
448        cnv->mode=0;                /* state */
449        cnv->toULength=0;           /* byteIndex */
450    }
451}
452
453static inline void
454setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
455   /* in ISO-2022-KR the designator sequence appears only once
456    * in a file so we append it only once
457    */
458    if( converter->charErrorBufferLength==0){
459
460        converter->charErrorBufferLength = 4;
461        converter->charErrorBuffer[0] = 0x1b;
462        converter->charErrorBuffer[1] = 0x24;
463        converter->charErrorBuffer[2] = 0x29;
464        converter->charErrorBuffer[3] = 0x43;
465    }
466    if(myConverterData->version == 1) {
467        UConverter *cnv = myConverterData->currentConverter;
468
469        cnv->fromUChar32=0;
470        cnv->fromUnicodeStatus=1;   /* prevLength */
471    }
472}
473
474static void
475_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
476
477    char myLocale[6]={' ',' ',' ',' ',' ',' '};
478
479    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
480    if(cnv->extraInfo != NULL) {
481        UConverterNamePieces stackPieces;
482        UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
483        UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
484        uint32_t version;
485
486        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
487
488        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
489        myConverterData->currentType = ASCII1;
490        cnv->fromUnicodeStatus =FALSE;
491        if(pArgs->locale){
492            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
493        }
494        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
495        myConverterData->version = version;
496        if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
497            (myLocale[2]=='_' || myLocale[2]=='\0'))
498        {
499            size_t len=0;
500            /* open the required converters and cache them */
501            if(version>MAX_JA_VERSION) {
502                /* prevent indexing beyond jpCharsetMasks[] */
503                myConverterData->version = version = 0;
504            }
505#if !UCONFIG_NO_NON_HTML5_CONVERSION
506            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
507                myConverterData->myConverterArray[ISO8859_7] =
508                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
509            }
510#endif
511            myConverterData->myConverterArray[JISX208] =
512                ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
513#if !UCONFIG_NO_NON_HTML5_CONVERSION
514            if(jpCharsetMasks[version]&CSM(JISX212)) {
515                myConverterData->myConverterArray[JISX212] =
516                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
517            }
518            if(jpCharsetMasks[version]&CSM(GB2312)) {
519                myConverterData->myConverterArray[GB2312] =
520                    ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
521            }
522            if(jpCharsetMasks[version]&CSM(KSC5601)) {
523                myConverterData->myConverterArray[KSC5601] =
524                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
525            }
526#endif
527
528            /* set the function pointers to appropriate funtions */
529            cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
530            uprv_strcpy(myConverterData->locale,"ja");
531
532            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
533            len = uprv_strlen(myConverterData->name);
534            myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
535            myConverterData->name[len+1]='\0';
536        }
537#ifdef U_ENABLE_ISO_2022_KR_CN
538        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
539            (myLocale[2]=='_' || myLocale[2]=='\0'))
540        {
541            const char *cnvName;
542            if(version==1) {
543                cnvName="icu-internal-25546";
544            } else {
545                cnvName="ibm-949";
546                myConverterData->version=version=0;
547            }
548            if(pArgs->onlyTestIsLoadable) {
549                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
550                uprv_free(cnv->extraInfo);
551                cnv->extraInfo=NULL;
552                return;
553            } else {
554                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
555                if (U_FAILURE(*errorCode)) {
556                    _ISO2022Close(cnv);
557                    return;
558                }
559
560                if(version==1) {
561                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
562                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
563                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
564                }else{
565                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
566                }
567
568                /* initialize the state variables */
569                setInitialStateToUnicodeKR(cnv, myConverterData);
570                setInitialStateFromUnicodeKR(cnv, myConverterData);
571
572                /* set the function pointers to appropriate funtions */
573                cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
574                uprv_strcpy(myConverterData->locale,"ko");
575            }
576        }
577        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
578            (myLocale[2]=='_' || myLocale[2]=='\0'))
579        {
580
581            /* open the required converters and cache them */
582            myConverterData->myConverterArray[GB2312_1] =
583                ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
584            if(version==1) {
585                myConverterData->myConverterArray[ISO_IR_165] =
586                    ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
587            }
588            myConverterData->myConverterArray[CNS_11643] =
589                ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
590
591
592            /* set the function pointers to appropriate funtions */
593            cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
594            uprv_strcpy(myConverterData->locale,"cn");
595
596            if (version==0){
597                myConverterData->version = 0;
598                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
599            }else if (version==1){
600                myConverterData->version = 1;
601                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
602            }else {
603                myConverterData->version = 2;
604                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
605            }
606        }
607#endif // U_ENABLE_ISO_2022_KR_CN
608        else{
609#ifdef U_ENABLE_GENERIC_ISO_2022
610            myConverterData->isFirstBuffer = TRUE;
611
612            /* append the UTF-8 escape sequence */
613            cnv->charErrorBufferLength = 3;
614            cnv->charErrorBuffer[0] = 0x1b;
615            cnv->charErrorBuffer[1] = 0x25;
616            cnv->charErrorBuffer[2] = 0x42;
617
618            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
619            /* initialize the state variables */
620            uprv_strcpy(myConverterData->name,"ISO_2022");
621#else
622            *errorCode = U_UNSUPPORTED_ERROR;
623            return;
624#endif
625        }
626
627        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
628
629        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
630            _ISO2022Close(cnv);
631        }
632    } else {
633        *errorCode = U_MEMORY_ALLOCATION_ERROR;
634    }
635}
636
637
638static void
639_ISO2022Close(UConverter *converter) {
640    UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
641    UConverterSharedData **array = myData->myConverterArray;
642    int32_t i;
643
644    if (converter->extraInfo != NULL) {
645        /*close the array of converter pointers and free the memory*/
646        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
647            if(array[i]!=NULL) {
648                ucnv_unloadSharedDataIfReady(array[i]);
649            }
650        }
651
652        ucnv_close(myData->currentConverter);
653
654        if(!converter->isExtraLocal){
655            uprv_free (converter->extraInfo);
656            converter->extraInfo = NULL;
657        }
658    }
659}
660
661static void
662_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
663    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
664    if(choice<=UCNV_RESET_TO_UNICODE) {
665        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
666        myConverterData->key = 0;
667        myConverterData->isEmptySegment = FALSE;
668    }
669    if(choice!=UCNV_RESET_TO_UNICODE) {
670        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
671    }
672#ifdef U_ENABLE_GENERIC_ISO_2022
673    if(myConverterData->locale[0] == 0){
674        if(choice<=UCNV_RESET_TO_UNICODE) {
675            myConverterData->isFirstBuffer = TRUE;
676            myConverterData->key = 0;
677            if (converter->mode == UCNV_SO){
678                ucnv_close (myConverterData->currentConverter);
679                myConverterData->currentConverter=NULL;
680            }
681            converter->mode = UCNV_SI;
682        }
683        if(choice!=UCNV_RESET_TO_UNICODE) {
684            /* re-append UTF-8 escape sequence */
685            converter->charErrorBufferLength = 3;
686            converter->charErrorBuffer[0] = 0x1b;
687            converter->charErrorBuffer[1] = 0x28;
688            converter->charErrorBuffer[2] = 0x42;
689        }
690    }
691    else
692#endif
693    {
694        /* reset the state variables */
695        if(myConverterData->locale[0] == 'k'){
696            if(choice<=UCNV_RESET_TO_UNICODE) {
697                setInitialStateToUnicodeKR(converter, myConverterData);
698            }
699            if(choice!=UCNV_RESET_TO_UNICODE) {
700                setInitialStateFromUnicodeKR(converter, myConverterData);
701            }
702        }
703    }
704}
705
706static const char*
707_ISO2022getName(const UConverter* cnv){
708    if(cnv->extraInfo){
709        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
710        return myData->name;
711    }
712    return NULL;
713}
714
715
716/*************** to unicode *******************/
717/****************************************************************************
718 * Recognized escape sequences are
719 * <ESC>(B  ASCII
720 * <ESC>.A  ISO-8859-1
721 * <ESC>.F  ISO-8859-7
722 * <ESC>(J  JISX-201
723 * <ESC>(I  JISX-201
724 * <ESC>$B  JISX-208
725 * <ESC>$@  JISX-208
726 * <ESC>$(D JISX-212
727 * <ESC>$A  GB2312
728 * <ESC>$(C KSC5601
729 */
/*
 * ISO-2022-JP: maps the table offset that getKey_2022() reports for a
 * complete escape sequence to the StateEnum value that changeState_2022()
 * acts on. INVALID_STATE entries make changeState_2022() report
 * U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The column header below counts entries 0..9 within each row of ten.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
741
742/*************** to unicode *******************/
/*
 * ISO-2022-CN: maps the table offset that getKey_2022() reports for a
 * complete escape sequence to the StateEnum value that changeState_2022()
 * acts on. INVALID_STATE entries make changeState_2022() report
 * U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The column header below counts entries 0..9 within each row of ten.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
754
755
756static UCNV_TableStates_2022
757getKey_2022(char c,int32_t* key,int32_t* offset){
758    int32_t togo;
759    int32_t low = 0;
760    int32_t hi = MAX_STATES_2022;
761    int32_t oldmid=0;
762
763    togo = normalize_esq_chars_2022[(uint8_t)c];
764    if(togo == 0) {
765        /* not a valid character anywhere in an escape sequence */
766        *key = 0;
767        *offset = 0;
768        return INVALID_2022;
769    }
770    togo = (*key << 5) + togo;
771
772    while (hi != low)  /*binary search*/{
773
774        register int32_t mid = (hi+low) >> 1; /*Finds median*/
775
776        if (mid == oldmid)
777            break;
778
779        if (escSeqStateTable_Key_2022[mid] > togo){
780            hi = mid;
781        }
782        else if (escSeqStateTable_Key_2022[mid] < togo){
783            low = mid;
784        }
785        else /*we found it*/{
786            *key = togo;
787            *offset = mid;
788            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
789        }
790        oldmid = mid;
791
792    }
793
794    *key = 0;
795    *offset = 0;
796    return INVALID_2022;
797}
798
/*
 * Runs through a state machine to determine the escape sequence - codepage
 * correspondence.
 *
 * Bytes are consumed from *source (up to sourceLimit), appended to
 * _this->toUBytes and fed one at a time to getKey_2022(), starting from the
 * key saved in the converter data. The function stops when the sequence is
 * complete, invalid, or the input runs out.
 *
 * Outcomes:
 * - Input ran out in the middle of a sequence: the partial key is saved in
 *   the converter data and the function returns without setting an error.
 * - Sequence not in the table: *err = U_ILLEGAL_ESCAPE_SEQUENCE; all bytes
 *   except the first are backed out (see the Ticket 5691 comment below).
 * - Complete sequence: handled according to var (designates a charset or
 *   performs a single shift); sequences that the variant does not accept
 *   yield U_UNSUPPORTED_ESCAPE_SEQUENCE with toUCallbackReason set to
 *   UCNV_UNASSIGNED. On success toULength is reset to 0.
 *
 * @param _this       the ISO-2022 converter
 * @param source      in/out pointer to the current input position
 * @param sourceLimit end of the input
 * @param var         which ISO-2022 variant's rules to apply
 * @param err         in/out error code
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* number of sequence bytes already buffered before this call */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        /* remember every byte so that it can be reported or replayed on error */
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
        value = getKey_2022(c,(int32_t *) &key, &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    /* save the key: 0 after a finished or rejected sequence, non-zero if incomplete */
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        case ISO_2022:
        {
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
            if(chosenConverterName == NULL) {
                /* SS2 or SS3 */
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                _this->toUCallbackReason = UCNV_UNASSIGNED;
                return;
            }

            /* replace the current sub-converter with the one named by the sequence */
            _this->mode = UCNV_SI;
            ucnv_close(myData2022->currentConverter);
            /* NOTE(review): myUConverter is not declared in this function -- confirm before enabling this code */
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
            if(U_SUCCESS(*err)) {
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                _this->mode = UCNV_SO;
            }
            break;
        }
#endif
        case ISO_2022_JP:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the locking-shift state (g<2) to return to after the single shift */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    /* accept only charsets that are enabled for this version of the converter */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    }
                    break;
                default:
                    /* accept only charsets that are enabled for this version of the converter */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_CN:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* ISO-IR-165 is rejected by version 0 of the converter */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    /*fall through*/
                case GB2312_1:
                    /*fall through*/
                case CNS_11643_1:
                    /* these three charsets are stored in cs[1] */
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
                    break;
                case CNS_11643_2:
                    /* stored in cs[2], which SS2 checks above */
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                       /* stored in cs[3], which SS3 checks above */
                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            /* exactly one table offset is accepted for ISO-2022-KR */
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the sequence was consumed; nothing is left pending */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             *   In escape sequences, all following bytes are "printable", that is,
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             *   they are valid single/lead bytes.
             *   For simplicity, we always only report the initial ESC byte as the
             *   illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            /* only the first byte remains as the reported illegal sequence */
            _this->toULength=1;
        }
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
}
1028
1029/*Checks the characters of the buffer against valid 2022 escape sequences
1030*if the match we return a pointer to the initial start of the sequence otherwise
1031*we return sourceLimit
1032*/
1033/*for 2022 looks ahead in the stream
1034 *to determine the longest possible convertible
1035 *data stream
1036 */
1037static inline const char*
1038getEndOfBuffer_2022(const char** source,
1039                   const char* sourceLimit,
1040                   UBool /*flush*/){
1041
1042    const char* mySource = *source;
1043
1044#ifdef U_ENABLE_GENERIC_ISO_2022
1045    if (*source >= sourceLimit)
1046        return sourceLimit;
1047
1048    do{
1049
1050        if (*mySource == ESC_2022){
1051            int8_t i;
1052            int32_t key = 0;
1053            int32_t offset;
1054            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1055
1056            /* Kludge: I could not
1057            * figure out the reason for validating an escape sequence
1058            * twice - once here and once in changeState_2022().
1059            * is it possible to have an ESC character in a ISO2022
1060            * byte stream which is valid in a code page? Is it legal?
1061            */
1062            for (i=0;
1063            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1064            i++) {
1065                value =  getKey_2022(*(mySource+i), &key, &offset);
1066            }
1067            if (value > 0 || *mySource==ESC_2022)
1068                return mySource;
1069
1070            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1071                return sourceLimit;
1072        }
1073    }while (++mySource < sourceLimit);
1074
1075    return sourceLimit;
1076#else
1077    while(mySource < sourceLimit && *mySource != ESC_2022) {
1078        ++mySource;
1079    }
1080    return mySource;
1081#endif
1082}
1083
1084
1085/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1086 * any future change in _MBCSFromUChar32() function should be reflected here.
1087 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1088 */
1089static inline int32_t
1090MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1091                                         UChar32 c,
1092                                         uint32_t* value,
1093                                         UBool useFallback,
1094                                         int outputType)
1095{
1096    const int32_t *cx;
1097    const uint16_t *table;
1098    uint32_t stage2Entry;
1099    uint32_t myValue;
1100    int32_t length;
1101    const uint8_t *p;
1102    /*
1103     * TODO(markus): Use and require new, faster MBCS conversion table structures.
1104     * Use internal version of ucnv_open() that verifies that the new structures are available,
1105     * else U_INTERNAL_PROGRAM_ERROR.
1106     */
1107    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1108    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1109        table=sharedData->mbcs.fromUnicodeTable;
1110        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1111        /* get the bytes and the length for the output */
1112        if(outputType==MBCS_OUTPUT_2){
1113            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1114            if(myValue<=0xff) {
1115                length=1;
1116            } else {
1117                length=2;
1118            }
1119        } else /* outputType==MBCS_OUTPUT_3 */ {
1120            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1121            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1122            if(myValue<=0xff) {
1123                length=1;
1124            } else if(myValue<=0xffff) {
1125                length=2;
1126            } else {
1127                length=3;
1128            }
1129        }
1130        /* is this code point assigned, or do we use fallbacks? */
1131        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1132            /* assigned */
1133            *value=myValue;
1134            return length;
1135        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1136            /*
1137             * We allow a 0 byte output if the "assigned" bit is set for this entry.
1138             * There is no way with this data structure for fallback output
1139             * to be a zero byte.
1140             */
1141            *value=myValue;
1142            return -length;
1143        }
1144    }
1145
1146    cx=sharedData->mbcs.extIndexes;
1147    if(cx!=NULL) {
1148        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1149    }
1150
1151    /* unassigned */
1152    return 0;
1153}
1154
1155/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1156 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1157 * @param retval pointer to output byte
1158 * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1159 */
1160static inline int32_t
1161MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1162                                       UChar32 c,
1163                                       uint32_t* retval,
1164                                       UBool useFallback)
1165{
1166    const uint16_t *table;
1167    int32_t value;
1168    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1169    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1170        return 0;
1171    }
1172    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1173    table=sharedData->mbcs.fromUnicodeTable;
1174    /* get the byte for the output */
1175    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1176    /* is this code point assigned, or do we use fallbacks? */
1177    *retval=(uint32_t)(value&0xff);
1178    if(value>=0xf00) {
1179        return 1;  /* roundtrip */
1180    } else if(useFallback ? value>=0x800 : value>=0xc00) {
1181        return -1;  /* fallback taken */
1182    } else {
1183        return 0;  /* no mapping */
1184    }
1185}
1186
/*
 * Converts a strict EUC DBCS value to the ISO 2022 range.
 * The two low-order bytes of value must each lie in A1..FE; if so, 0x80 is
 * subtracted from each of them (value - 0x8080), which moves them to 21..7E.
 * Returns 0 if either byte is out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint32_t lead = (value >> 8) & 0xff;
    const uint32_t trail = value & 0xff;

    if(lead < 0xa1 || lead > 0xfe || trail < 0xa1 || trail > 0xfe) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1203
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Not compiled (inside #if 0).
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* add 0x80 to each of the two bytes, then accept the result only if both are in A1..FE */
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
1221
1222#ifdef U_ENABLE_GENERIC_ISO_2022
1223
1224/**********************************************************************************
1225*  ISO-2022 Converter
1226*
1227*
1228*/
1229
/*
 * To-Unicode conversion for the generic ISO-2022 converter
 * (only compiled with U_ENABLE_GENERIC_ISO_2022).
 *
 * Alternates between two steps until the input is used up or the function
 * has to return to its caller:
 * 1. Unless an escape sequence is pending (key != 0), convert the bytes up to
 *    the position returned by getEndOfBuffer_2022() with the current
 *    sub-converter (an "ASCII" converter is opened if there is none yet).
 * 2. Let changeState_2022() process the escape sequence that follows.
 *
 * Overflow output, invalid input bytes and partial input of the
 * sub-converter are copied into this converter before returning, so that the
 * caller sees them on the converter it passed in.
 *
 * @param args conversion arguments (converter, source, target, offsets, flush)
 * @param err  in/out error code
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    /* no escape sequence has selected a converter yet: use ASCII */
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily substitute the sub-converter in args; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /*
                 * Note on grouping: the parenthesis opened before "args->offsets"
                 * is closed only at the end of the following line; the condition
                 * relies on && binding more tightly than ||.
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* now at an escape sequence, or continuing one from a previous buffer */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1333
1334#endif
1335
1336/*
1337 * To Unicode Callback helper function
1338 */
1339static void
1340toUnicodeCallback(UConverter *cnv,
1341                  const uint32_t sourceChar, const uint32_t targetUniChar,
1342                  UErrorCode* err){
1343    if(sourceChar>0xff){
1344        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1345        cnv->toUBytes[1] = (uint8_t)sourceChar;
1346        cnv->toULength = 2;
1347    }
1348    else{
1349        cnv->toUBytes[0] =(char) sourceChar;
1350        cnv->toULength = 1;
1351    }
1352
1353    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1354        *err = U_INVALID_CHAR_FOUND;
1355    }
1356    else{
1357        *err = U_ILLEGAL_CHAR_FOUND;
1358    }
1359}
1360
1361/**************************************ISO-2022-JP*************************************************/
1362
1363/************************************** IMPORTANT **************************************************
1364* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1365* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1366* The converter iterates over each Unicode codepoint
1367* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1368* processed one char at a time it would make sense to reduce the extra processing a canned converter
1369* would do as far as possible.
1370*
1371* If the implementation of these macros or structure of sharedData struct change in the future, make
1372* sure that ISO-2022 is also changed.
1373***************************************************************************************************
1374*/
1375
1376/***************************************************************************************************
1377* Rules for ISO-2022-jp encoding
1378* (i)   Escape sequences must be fully contained within a line they should not
1379*       span new lines or CRs
1380* (ii)  If the last character on a line is represented by two bytes then an ASCII or
1381*       JIS-Roman character escape sequence should follow before the line terminates
1382* (iii) If the first character on the line is represented by two bytes then a two
1383*       byte character escape sequence should precede it
1384* (iv)  If no escape sequence is encountered then the characters are ASCII
1385* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1386*       and invoked with SS2 (ESC N).
1387* (vi)  If there is any G0 designation in text, there must be a switch to
1388*       ASCII or to JIS X 0201-Roman before a space character (but not
1389*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1390*       characters such as tab or CRLF.
* (vii) Supported encodings:
1392*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1393*
1394*  source : RFC-1554
1395*
1396*          JISX201, JISX208,JISX212 : new .cnv data files created
1397*          KSC5601 : alias to ibm-949 mapping table
1398*          GB2312 : alias to ibm-1386 mapping table
1399*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
1401*/
1402
/*
 * preference order of JP charsets
 * (most preferred first; presumably consulted by the from-Unicode code when
 * it has to pick a charset for a character -- the user of this table is not
 * part of this section, TODO confirm)
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1415
/*
 * Designation escape sequences, one row per charset.
 * Rows are indexed by the numeric value of the charset enum constant
 * (for example JISX201 = 3); they do NOT follow the order of jpCharsetPref[].
 * The bytes are given as numeric values; unused trailing bytes are zero.
 */
static const char escSeqChars[][6] = {
    { 0x1B, 0x28, 0x42 },        /* <ESC>(B  ASCII       */
    { 0x1B, 0x2E, 0x41 },        /* <ESC>.A  ISO-8859-1  */
    { 0x1B, 0x2E, 0x46 },        /* <ESC>.F  ISO-8859-7  */
    { 0x1B, 0x28, 0x4A },        /* <ESC>(J  JISX-201    */
    { 0x1B, 0x24, 0x42 },        /* <ESC>$B  JISX-208    */
    { 0x1B, 0x24, 0x28, 0x44 },  /* <ESC>$(D JISX-212    */
    { 0x1B, 0x24, 0x41 },        /* <ESC>$A  GB2312      */
    { 0x1B, 0x24, 0x28, 0x43 },  /* <ESC>$(C KSC5601     */
    { 0x1B, 0x28, 0x49 }         /* <ESC>(I  HWKANA_7BIT */
};
/*
 * Number of bytes in each designation escape sequence.
 * Indexed by charset enum value, in the same order as escSeqChars[].
 */
static const int8_t escSeqCharsLen[] = {
    /* ASCII       <ESC>(B  */ 3,
    /* ISO-8859-1  <ESC>.A  */ 3,
    /* ISO-8859-7  <ESC>.F  */ 3,
    /* JISX-201    <ESC>(J  */ 3,
    /* JISX-208    <ESC>$B  */ 3,
    /* JISX-212    <ESC>$(D */ 4,
    /* GB2312      <ESC>$A  */ 3,
    /* KSC5601     <ESC>$(C */ 4,
    /* HWKANA_7BIT <ESC>(I  */ 3
};
1443
1444/*
1445* The iteration over various code pages works this way:
1446* i)   Get the currentState from myConverterData->currentState
1447* ii)  Check if the character is mapped to a valid character in the currentState
1448*      Yes ->  a) set the initIterState to currentState
1449*       b) remain in this state until an invalid character is found
1450*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check whether it
*      is equal to the initIterState
1453*      Yes ->  A character that cannot be represented in any of the supported encodings
1454*       break and return a U_INVALID_CHARACTER error
1455*      No  ->  Continue and find the character in next code page
1456*
1457*
1458* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1459*/
1460
/*
 * Map a JIS X 0201 (Roman) byte 00..7F to Unicode.
 * The set matches ASCII except for two positions:
 *   5C -> U+00A5, 7E -> U+203E.
 * Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1474
/*
 * Map a Unicode code point to a JIS X 0201 (Roman) byte 00..7F.
 * U+00A5 -> 5C and U+203E -> 7E; U+005C and U+007E themselves have no
 * mapping. Returns 0xFFFE for every unmappable code point.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two ASCII positions are occupied by other characters */
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1489
/*
 * Convert a valid Shift-JIS byte pair (lead byte in bits 15..8, trail byte
 * in bits 7..0) to the corresponding JIS X 0208 pair of 21..7E bytes,
 * packed the same way.
 * Returns 0 when the pair lies beyond the JIS X 0208 range (above 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t row;
    uint8_t low;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    low = (uint8_t)value;

    /* lead byte: 81..9F and E0..EF collapse into one contiguous range */
    row = value & 0xff00;
    row -= (row <= 0x9f00) ? 0x7000 : 0xb000;
    /* each Shift-JIS lead byte covers two JIS X 0208 rows */
    row <<= 1;

    if(low > 0x9e) {
        /* trail 9F..FC: second (even) row of the pair */
        return row | (low - 0x7e);
    }

    /* trail 40..9E: first (odd) row of the pair; 7F is skipped in Shift-JIS */
    row -= 0x100;
    return row | (low - ((low <= 0x7e) ? 0x1f : 0x20));
}
1525
/*
 * Convert a JIS X 0208 byte pair (each byte expected in 21..7E) to Shift-JIS
 * and store the two result bytes in bytes[0] and bytes[1].
 * When an input byte is out of range the corresponding output byte is either
 * forced to 0 or left at a value the arithmetic produces anyway; per the
 * design, the result is then not valid Shift-JIS so that the table lookup
 * that follows rejects it.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1 & 1) == 0) {
        /* even row: trail bytes 21..7E become 9F..FC */
        trail = ((uint8_t)(c2 - 0x21) <= ((0x7e) - 0x21)) ? (uint8_t)(c2 + 0x7e) : 0;
        lead = c1;
    } else {
        /* odd row: trail bytes become 40..7E and 80..9E (7F is skipped) */
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
        lead = (uint8_t)(c1 + 1);
    }

    /* two JIS X 0208 rows share one Shift-JIS lead byte */
    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1562
1563/*
1564 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1565 * Katakana.
1566 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1567 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1568 * These were the only fallbacks in ICU's jisx-208.ucm file.
1569 */
1570static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1571    0x2123,  /* U+FF61 */
1572    0x2156,
1573    0x2157,
1574    0x2122,
1575    0x2126,
1576    0x2572,
1577    0x2521,
1578    0x2523,
1579    0x2525,
1580    0x2527,
1581    0x2529,
1582    0x2563,
1583    0x2565,
1584    0x2567,
1585    0x2543,
1586    0x213C,  /* U+FF70 */
1587    0x2522,
1588    0x2524,
1589    0x2526,
1590    0x2528,
1591    0x252A,
1592    0x252B,
1593    0x252D,
1594    0x252F,
1595    0x2531,
1596    0x2533,
1597    0x2535,
1598    0x2537,
1599    0x2539,
1600    0x253B,
1601    0x253D,
1602    0x253F,  /* U+FF80 */
1603    0x2541,
1604    0x2544,
1605    0x2546,
1606    0x2548,
1607    0x254A,
1608    0x254B,
1609    0x254C,
1610    0x254D,
1611    0x254E,
1612    0x254F,
1613    0x2552,
1614    0x2555,
1615    0x2558,
1616    0x255B,
1617    0x255E,
1618    0x255F,  /* U+FF90 */
1619    0x2560,
1620    0x2561,
1621    0x2562,
1622    0x2564,
1623    0x2566,
1624    0x2568,
1625    0x2569,
1626    0x256A,
1627    0x256B,
1628    0x256C,
1629    0x256D,
1630    0x256F,
1631    0x2573,
1632    0x212B,
1633    0x212C   /* U+FF9F */
1634};
1635
1636static void
1637UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1638    UConverter *cnv = args->converter;
1639    UConverterDataISO2022 *converterData;
1640    ISO2022State *pFromU2022State;
1641    uint8_t *target = (uint8_t *) args->target;
1642    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1643    const UChar* source = args->source;
1644    const UChar* sourceLimit = args->sourceLimit;
1645    int32_t* offsets = args->offsets;
1646    UChar32 sourceChar;
1647    char buffer[8];
1648    int32_t len, outLen;
1649    int8_t choices[10];
1650    int32_t choiceCount;
1651    uint32_t targetValue = 0;
1652    UBool useFallback;
1653
1654    int32_t i;
1655    int8_t cs, g;
1656
1657    /* set up the state */
1658    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1659    pFromU2022State   = &converterData->fromU2022State;
1660
1661    choiceCount = 0;
1662
1663    /* check if the last codepoint of previous buffer was a lead surrogate*/
1664    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1665        goto getTrail;
1666    }
1667
1668    while(source < sourceLimit) {
1669        if(target < targetLimit) {
1670
1671            sourceChar  = *(source++);
1672            /*check if the char is a First surrogate*/
1673            if(U16_IS_SURROGATE(sourceChar)) {
1674                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1675getTrail:
1676                    /*look ahead to find the trail surrogate*/
1677                    if(source < sourceLimit) {
1678                        /* test the following code unit */
1679                        UChar trail=(UChar) *source;
1680                        if(U16_IS_TRAIL(trail)) {
1681                            source++;
1682                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1683                            cnv->fromUChar32=0x00;
1684                            /* convert this supplementary code point */
1685                            /* exit this condition tree */
1686                        } else {
1687                            /* this is an unmatched lead code unit (1st surrogate) */
1688                            /* callback(illegal) */
1689                            *err=U_ILLEGAL_CHAR_FOUND;
1690                            cnv->fromUChar32=sourceChar;
1691                            break;
1692                        }
1693                    } else {
1694                        /* no more input */
1695                        cnv->fromUChar32=sourceChar;
1696                        break;
1697                    }
1698                } else {
1699                    /* this is an unmatched trail code unit (2nd surrogate) */
1700                    /* callback(illegal) */
1701                    *err=U_ILLEGAL_CHAR_FOUND;
1702                    cnv->fromUChar32=sourceChar;
1703                    break;
1704                }
1705            }
1706
1707            /* do not convert SO/SI/ESC */
1708            if(IS_2022_CONTROL(sourceChar)) {
1709                /* callback(illegal) */
1710                *err=U_ILLEGAL_CHAR_FOUND;
1711                cnv->fromUChar32=sourceChar;
1712                break;
1713            }
1714
1715            /* do the conversion */
1716
1717            if(choiceCount == 0) {
1718                uint16_t csm;
1719
1720                /*
1721                 * The csm variable keeps track of which charsets are allowed
1722                 * and not used yet while building the choices[].
1723                 */
1724                csm = jpCharsetMasks[converterData->version];
1725                choiceCount = 0;
1726
1727                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1728                if(converterData->version == 3 || converterData->version == 4) {
1729                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1730                }
1731                /* Do not try single-byte half-width Katakana for other versions. */
1732                csm &= ~CSM(HWKANA_7BIT);
1733
1734                /* try the current G0 charset */
1735                choices[choiceCount++] = cs = pFromU2022State->cs[0];
1736                csm &= ~CSM(cs);
1737
1738                /* try the current G2 charset */
1739                if((cs = pFromU2022State->cs[2]) != 0) {
1740                    choices[choiceCount++] = cs;
1741                    csm &= ~CSM(cs);
1742                }
1743
1744                /* try all the other possible charsets */
1745                for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1746                    cs = (int8_t)jpCharsetPref[i];
1747                    if(CSM(cs) & csm) {
1748                        choices[choiceCount++] = cs;
1749                        csm &= ~CSM(cs);
1750                    }
1751                }
1752            }
1753
1754            cs = g = 0;
1755            /*
1756             * len==0: no mapping found yet
1757             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1758             * len>0: found a roundtrip result, done
1759             */
1760            len = 0;
1761            /*
1762             * We will turn off useFallback after finding a fallback,
1763             * but we still get fallbacks from PUA code points as usual.
1764             * Therefore, we will also need to check that we don't overwrite
1765             * an early fallback with a later one.
1766             */
1767            useFallback = cnv->useFallback;
1768
1769            for(i = 0; i < choiceCount && len <= 0; ++i) {
1770                uint32_t value;
1771                int32_t len2;
1772                int8_t cs0 = choices[i];
1773                switch(cs0) {
1774                case ASCII:
1775                    if(sourceChar <= 0x7f) {
1776                        targetValue = (uint32_t)sourceChar;
1777                        len = 1;
1778                        cs = cs0;
1779                        g = 0;
1780                    }
1781                    break;
1782                case ISO8859_1:
1783                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1784                        targetValue = (uint32_t)sourceChar - 0x80;
1785                        len = 1;
1786                        cs = cs0;
1787                        g = 2;
1788                    }
1789                    break;
1790                case HWKANA_7BIT:
1791                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1792                        if(converterData->version==3) {
1793                            /* JIS7: use G1 (SO) */
1794                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1795                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1796                            len = 1;
1797                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1798                            g = 1;
1799                        } else if(converterData->version==4) {
1800                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1801                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1802                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1803                            len = 1;
1804
1805                            cs = pFromU2022State->cs[0];
1806                            if(IS_JP_DBCS(cs)) {
1807                                /* switch from a DBCS charset to JISX201 */
1808                                cs = (int8_t)JISX201;
1809                            }
1810                            /* else stay in the current G0 charset */
1811                            g = 0;
1812                        }
1813                        /* else do not use HWKANA_7BIT with other versions */
1814                    }
1815                    break;
1816                case JISX201:
1817                    /* G0 SBCS */
1818                    value = jisx201FromU(sourceChar);
1819                    if(value <= 0x7f) {
1820                        targetValue = value;
1821                        len = 1;
1822                        cs = cs0;
1823                        g = 0;
1824                        useFallback = FALSE;
1825                    }
1826                    break;
1827                case JISX208:
1828                    /* G0 DBCS from Shift-JIS table */
1829                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1830                                converterData->myConverterArray[cs0],
1831                                sourceChar, &value,
1832                                useFallback, MBCS_OUTPUT_2);
1833                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1834                        value = _2022FromSJIS(value);
1835                        if(value != 0) {
1836                            targetValue = value;
1837                            len = len2;
1838                            cs = cs0;
1839                            g = 0;
1840                            useFallback = FALSE;
1841                        }
1842                    } else if(len == 0 && useFallback &&
1843                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1844                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
1845                        len = -2;
1846                        cs = cs0;
1847                        g = 0;
1848                        useFallback = FALSE;
1849                    }
1850                    break;
1851                case ISO8859_7:
1852                    /* G0 SBCS forced to 7-bit output */
1853                    len2 = MBCS_SINGLE_FROM_UCHAR32(
1854                                converterData->myConverterArray[cs0],
1855                                sourceChar, &value,
1856                                useFallback);
1857                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1858                        targetValue = value - 0x80;
1859                        len = len2;
1860                        cs = cs0;
1861                        g = 2;
1862                        useFallback = FALSE;
1863                    }
1864                    break;
1865                default:
1866                    /* G0 DBCS */
1867                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1868                                converterData->myConverterArray[cs0],
1869                                sourceChar, &value,
1870                                useFallback, MBCS_OUTPUT_2);
1871                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1872                        if(cs0 == KSC5601) {
1873                            /*
1874                             * Check for valid bytes for the encoding scheme.
1875                             * This is necessary because the sub-converter (windows-949)
1876                             * has a broader encoding scheme than is valid for 2022.
1877                             */
1878                            value = _2022FromGR94DBCS(value);
1879                            if(value == 0) {
1880                                break;
1881                            }
1882                        }
1883                        targetValue = value;
1884                        len = len2;
1885                        cs = cs0;
1886                        g = 0;
1887                        useFallback = FALSE;
1888                    }
1889                    break;
1890                }
1891            }
1892
1893            if(len != 0) {
1894                if(len < 0) {
1895                    len = -len;  /* fallback */
1896                }
1897                outLen = 0; /* count output bytes */
1898
1899                /* write SI if necessary (only for JIS7) */
1900                if(pFromU2022State->g == 1 && g == 0) {
1901                    buffer[outLen++] = UCNV_SI;
1902                    pFromU2022State->g = 0;
1903                }
1904
1905                /* write the designation sequence if necessary */
1906                if(cs != pFromU2022State->cs[g]) {
1907                    int32_t escLen = escSeqCharsLen[cs];
1908                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1909                    outLen += escLen;
1910                    pFromU2022State->cs[g] = cs;
1911
1912                    /* invalidate the choices[] */
1913                    choiceCount = 0;
1914                }
1915
1916                /* write the shift sequence if necessary */
1917                if(g != pFromU2022State->g) {
1918                    switch(g) {
1919                    /* case 0 handled before writing escapes */
1920                    case 1:
1921                        buffer[outLen++] = UCNV_SO;
1922                        pFromU2022State->g = 1;
1923                        break;
1924                    default: /* case 2 */
1925                        buffer[outLen++] = 0x1b;
1926                        buffer[outLen++] = 0x4e;
1927                        break;
1928                    /* no case 3: no SS3 in ISO-2022-JP-x */
1929                    }
1930                }
1931
1932                /* write the output bytes */
1933                if(len == 1) {
1934                    buffer[outLen++] = (char)targetValue;
1935                } else /* len == 2 */ {
1936                    buffer[outLen++] = (char)(targetValue >> 8);
1937                    buffer[outLen++] = (char)targetValue;
1938                }
1939            } else {
1940                /*
1941                 * if we cannot find the character after checking all codepages
1942                 * then this is an error
1943                 */
1944                *err = U_INVALID_CHAR_FOUND;
1945                cnv->fromUChar32=sourceChar;
1946                break;
1947            }
1948
1949            if(sourceChar == CR || sourceChar == LF) {
1950                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1951                pFromU2022State->cs[2] = 0;
1952                choiceCount = 0;
1953            }
1954
1955            /* output outLen>0 bytes in buffer[] */
1956            if(outLen == 1) {
1957                *target++ = buffer[0];
1958                if(offsets) {
1959                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1960                }
1961            } else if(outLen == 2 && (target + 2) <= targetLimit) {
1962                *target++ = buffer[0];
1963                *target++ = buffer[1];
1964                if(offsets) {
1965                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1966                    *offsets++ = sourceIndex;
1967                    *offsets++ = sourceIndex;
1968                }
1969            } else {
1970                fromUWriteUInt8(
1971                    cnv,
1972                    buffer, outLen,
1973                    &target, (const char *)targetLimit,
1974                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1975                    err);
1976                if(U_FAILURE(*err)) {
1977                    break;
1978                }
1979            }
1980        } /* end if(myTargetIndex<myTargetLength) */
1981        else{
1982            *err =U_BUFFER_OVERFLOW_ERROR;
1983            break;
1984        }
1985
1986    }/* end while(mySourceIndex<mySourceLength) */
1987
1988    /*
1989     * the end of the input stream and detection of truncated input
1990     * are handled by the framework, but for ISO-2022-JP conversion
1991     * we need to be in ASCII mode at the very end
1992     *
1993     * conditions:
1994     *   successful
1995     *   in SO mode or not in ASCII mode
1996     *   end of input and no truncated input
1997     */
1998    if( U_SUCCESS(*err) &&
1999        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2000        args->flush && source>=sourceLimit && cnv->fromUChar32==0
2001    ) {
2002        int32_t sourceIndex;
2003
2004        outLen = 0;
2005
2006        if(pFromU2022State->g != 0) {
2007            buffer[outLen++] = UCNV_SI;
2008            pFromU2022State->g = 0;
2009        }
2010
2011        if(pFromU2022State->cs[0] != ASCII) {
2012            int32_t escLen = escSeqCharsLen[ASCII];
2013            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2014            outLen += escLen;
2015            pFromU2022State->cs[0] = (int8_t)ASCII;
2016        }
2017
2018        /* get the source index of the last input character */
2019        /*
2020         * TODO this would be simpler and more reliable if we used a pair
2021         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2022         * so that we could simply use the prevSourceIndex here;
2023         * this code gives an incorrect result for the rare case of an unmatched
2024         * trail surrogate that is alone in the last buffer of the text stream
2025         */
2026        sourceIndex=(int32_t)(source-args->source);
2027        if(sourceIndex>0) {
2028            --sourceIndex;
2029            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2030                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2031            ) {
2032                --sourceIndex;
2033            }
2034        } else {
2035            sourceIndex=-1;
2036        }
2037
2038        fromUWriteUInt8(
2039            cnv,
2040            buffer, outLen,
2041            &target, (const char *)targetLimit,
2042            &offsets, sourceIndex,
2043            err);
2044    }
2045
2046    /*save the state and return */
2047    args->source = source;
2048    args->target = (char*)target;
2049}
2050
2051/*************** to unicode *******************/
2052
2053static void
2054UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2055                                               UErrorCode* err){
2056    char tempBuf[2];
2057    const char *mySource = (char *) args->source;
2058    UChar *myTarget = args->target;
2059    const char *mySourceLimit = args->sourceLimit;
2060    uint32_t targetUniChar = 0x0000;
2061    uint32_t mySourceChar = 0x0000;
2062    uint32_t tmpSourceChar = 0x0000;
2063    UConverterDataISO2022* myData;
2064    ISO2022State *pToU2022State;
2065    StateEnum cs;
2066
2067    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2068    pToU2022State = &myData->toU2022State;
2069
2070    if(myData->key != 0) {
2071        /* continue with a partial escape sequence */
2072        goto escape;
2073    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2074        /* continue with a partial double-byte character */
2075        mySourceChar = args->converter->toUBytes[0];
2076        args->converter->toULength = 0;
2077        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2078        targetUniChar = missingCharMarker;
2079        goto getTrailByte;
2080    }
2081
2082    while(mySource < mySourceLimit){
2083
2084        targetUniChar =missingCharMarker;
2085
2086        if(myTarget < args->targetLimit){
2087
2088            mySourceChar= (unsigned char) *mySource++;
2089
2090            switch(mySourceChar) {
2091            case UCNV_SI:
2092                if(myData->version==3) {
2093                    pToU2022State->g=0;
2094                    continue;
2095                } else {
2096                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2097                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2098                    break;
2099                }
2100
2101            case UCNV_SO:
2102                if(myData->version==3) {
2103                    /* JIS7: switch to G1 half-width Katakana */
2104                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2105                    pToU2022State->g=1;
2106                    continue;
2107                } else {
2108                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2109                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2110                    break;
2111                }
2112
2113            case ESC_2022:
2114                mySource--;
2115escape:
2116                {
2117                    const char * mySourceBefore = mySource;
2118                    int8_t toULengthBefore = args->converter->toULength;
2119
2120                    changeState_2022(args->converter,&(mySource),
2121                        mySourceLimit, ISO_2022_JP,err);
2122
2123                    /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2124                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2125                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2126                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
2127                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2128                    }
2129                }
2130
2131                /* invalid or illegal escape sequence */
2132                if(U_FAILURE(*err)){
2133                    args->target = myTarget;
2134                    args->source = mySource;
2135                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
2136                    return;
2137                }
2138                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2139                if(myData->key==0) {
2140                    myData->isEmptySegment = TRUE;
2141                }
2142                continue;
2143
2144            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2145
2146            case CR:
2147                /*falls through*/
2148            case LF:
2149                /* automatically reset to single-byte mode */
2150                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2151                    pToU2022State->cs[0] = (int8_t)ASCII;
2152                }
2153                pToU2022State->cs[2] = 0;
2154                pToU2022State->g = 0;
2155                /* falls through */
2156            default:
2157                /* convert one or two bytes */
2158                myData->isEmptySegment = FALSE;
2159                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2160                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2161                    !IS_JP_DBCS(cs)
2162                ) {
2163                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2164                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2165
2166                    /* return from a single-shift state to the previous one */
2167                    if(pToU2022State->g >= 2) {
2168                        pToU2022State->g=pToU2022State->prevG;
2169                    }
2170                } else switch(cs) {
2171                case ASCII:
2172                    if(mySourceChar <= 0x7f) {
2173                        targetUniChar = mySourceChar;
2174                    }
2175                    break;
2176                case ISO8859_1:
2177                    if(mySourceChar <= 0x7f) {
2178                        targetUniChar = mySourceChar + 0x80;
2179                    }
2180                    /* return from a single-shift state to the previous one */
2181                    pToU2022State->g=pToU2022State->prevG;
2182                    break;
2183                case ISO8859_7:
2184                    if(mySourceChar <= 0x7f) {
2185                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
2186                        targetUniChar =
2187                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2188                                myData->myConverterArray[cs],
2189                                mySourceChar + 0x80);
2190                    }
2191                    /* return from a single-shift state to the previous one */
2192                    pToU2022State->g=pToU2022State->prevG;
2193                    break;
2194                case JISX201:
2195                    if(mySourceChar <= 0x7f) {
2196                        targetUniChar = jisx201ToU(mySourceChar);
2197                    }
2198                    break;
2199                case HWKANA_7BIT:
2200                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2201                        /* 7-bit halfwidth Katakana */
2202                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2203                    }
2204                    break;
2205                default:
2206                    /* G0 DBCS */
2207                    if(mySource < mySourceLimit) {
2208                        int leadIsOk, trailIsOk;
2209                        uint8_t trailByte;
2210getTrailByte:
2211                        trailByte = (uint8_t)*mySource;
2212                        /*
2213                         * Ticket 5691: consistent illegal sequences:
2214                         * - We include at least the first byte in the illegal sequence.
2215                         * - If any of the non-initial bytes could be the start of a character,
2216                         *   we stop the illegal sequence before the first one of those.
2217                         *
2218                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2219                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2220                         * Otherwise we convert or report the pair of bytes.
2221                         */
2222                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2223                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2224                        if (leadIsOk && trailIsOk) {
2225                            ++mySource;
2226                            tmpSourceChar = (mySourceChar << 8) | trailByte;
2227                            if(cs == JISX208) {
2228                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2229                                mySourceChar = tmpSourceChar;
2230                            } else {
2231                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2232                                mySourceChar = tmpSourceChar;
2233                                if (cs == KSC5601) {
2234                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2235                                }
2236                                tempBuf[0] = (char)(tmpSourceChar >> 8);
2237                                tempBuf[1] = (char)(tmpSourceChar);
2238                            }
2239                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2240                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2241                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2242                            ++mySource;
2243                            /* add another bit so that the code below writes 2 bytes in case of error */
2244                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2245                        }
2246                    } else {
2247                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2248                        args->converter->toULength = 1;
2249                        goto endloop;
2250                    }
2251                }  /* End of inner switch */
2252                break;
2253            }  /* End of outer switch */
2254            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2255                if(args->offsets){
2256                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2257                }
2258                *(myTarget++)=(UChar)targetUniChar;
2259            }
2260            else if(targetUniChar > missingCharMarker){
2261                /* disassemble the surrogate pair and write to output*/
2262                targetUniChar-=0x0010000;
2263                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2264                if(args->offsets){
2265                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2266                }
2267                ++myTarget;
2268                if(myTarget< args->targetLimit){
2269                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2270                    if(args->offsets){
2271                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2272                    }
2273                    ++myTarget;
2274                }else{
2275                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2276                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2277                }
2278
2279            }
2280            else{
2281                /* Call the callback function*/
2282                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2283                break;
2284            }
2285        }
2286        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2287            *err =U_BUFFER_OVERFLOW_ERROR;
2288            break;
2289        }
2290    }
2291endloop:
2292    args->target = myTarget;
2293    args->source = mySource;
2294}
2295
2296
2297/***************************************************************
2298*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line, before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*      and SI to shift into single-byte mode.
2304*/
2305static void
2306UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2307
2308    UConverter* saveConv = args->converter;
2309    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2310    args->converter=myConverterData->currentConverter;
2311
2312    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2313    ucnv_MBCSFromUnicodeWithOffsets(args,err);
2314    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2315
2316    if(*err == U_BUFFER_OVERFLOW_ERROR) {
2317        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2318            uprv_memcpy(
2319                saveConv->charErrorBuffer,
2320                myConverterData->currentConverter->charErrorBuffer,
2321                myConverterData->currentConverter->charErrorBufferLength);
2322        }
2323        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2324        myConverterData->currentConverter->charErrorBufferLength = 0;
2325    }
2326    args->converter=saveConv;
2327}
2328
/*
 * Converts UTF-16 input to ISO-2022-KR bytes, optionally recording source offsets.
 *
 * When the converter data's version is 1 the call is forwarded to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(). Otherwise every code unit
 * is mapped with MBCS_FROM_UCHAR32_ISO2022():
 *   - a 1-byte result <= 0x7f is written unchanged;
 *   - a 2-byte result with both bytes in a1..fe is written with 0x80 subtracted
 *     from each byte;
 *   - an SO or SI byte is written first whenever the output changes between
 *     single-byte and double-byte mode.
 * Any other result is treated as unmappable.
 *
 * State kept on args->converter between calls:
 *   fromUnicodeStatus - nonzero while the output is in double-byte mode
 *   fromUChar32       - the code point that stopped the conversion
 *                       (error, or a lead surrogate waiting for its trail)
 *
 * Errors set in *err:
 *   U_ILLEGAL_CHAR_FOUND    - input matched by IS_2022_CONTROL (SO/SI/ESC),
 *                             or an unmatched surrogate
 *   U_INVALID_CHAR_FOUND    - no acceptable mapping; this includes every
 *                             supplementary code point
 *   U_BUFFER_OVERFLOW_ERROR - the target is full; bytes that did not fit are
 *                             stored in charErrorBuffer
 *
 * @param args  conversion arguments; source and target are advanced on return
 * @param err   in/out ICU error code
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546: pass the arguments to
     * the MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): repeats the assignment made two statements above */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* a code point is pending from the previous call: resume at the trail surrogate lookup */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only SBCS results up to 0x7f and DBCS results are accepted */
            /* for DBCS results both bytes must be in the range a1..fe */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                  /* write the shift byte if the mode changes; there is room for it
                   * because target < targetLimit was checked above */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write the target bytes; whatever does not fit goes to charErrorBuffer */
                if(targetByteUnit <= 0x00FF){
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double byte: subtract 0x80 from each byte before writing */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* oops.. the code point is unassigned:
                 * set the error and reason
                 */

                /* check whether the code unit is a surrogate */
                if(U16_IS_SURROGATE(sourceChar)) {
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                        /*look ahead to find the trail surrogate*/
                        if(source <  sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(U16_IS_TRAIL(trail)) {
                                source++;
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                                *err = U_INVALID_CHAR_FOUND;
                                /* the pair is combined into one supplementary code point, */
                                /* which is reported as unmappable: callback(unassigned) */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input: not an error here; the lead surrogate */
                            /* is stored in fromUChar32 below */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                /* remember the code point that stopped the conversion */
                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more so that the index points at the lead of a surrogate pair */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            /* nothing was consumed in this call: there is no source index to report */
            sourceIndex=-1;
        }

        /* write the 1-byte SHIFT_IN_STR */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}
2545
2546/************************ To Unicode ***************************************/
2547
2548static void
2549UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2550                                                            UErrorCode* err){
2551    char const* sourceStart;
2552    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2553
2554    UConverterToUnicodeArgs subArgs;
2555    int32_t minArgsSize;
2556
2557    /* set up the subconverter arguments */
2558    if(args->size<sizeof(UConverterToUnicodeArgs)) {
2559        minArgsSize = args->size;
2560    } else {
2561        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2562    }
2563
2564    uprv_memcpy(&subArgs, args, minArgsSize);
2565    subArgs.size = (uint16_t)minArgsSize;
2566    subArgs.converter = myData->currentConverter;
2567
2568    /* remember the original start of the input for offsets */
2569    sourceStart = args->source;
2570
2571    if(myData->key != 0) {
2572        /* continue with a partial escape sequence */
2573        goto escape;
2574    }
2575
2576    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2577        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2578        subArgs.source = args->source;
2579        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2580        if(subArgs.source != subArgs.sourceLimit) {
2581            /*
2582             * get the current partial byte sequence
2583             *
2584             * it needs to be moved between the public and the subconverter
2585             * so that the conversion framework, which only sees the public
2586             * converter, can handle truncated and illegal input etc.
2587             */
2588            if(args->converter->toULength > 0) {
2589                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2590            }
2591            subArgs.converter->toULength = args->converter->toULength;
2592
2593            /*
2594             * Convert up to the end of the input, or to before the next escape character.
2595             * Does not handle conversion extensions because the preToU[] state etc.
2596             * is not copied.
2597             */
2598            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2599
2600            if(args->offsets != NULL && sourceStart != args->source) {
2601                /* update offsets to base them on the actual start of the input */
2602                int32_t *offsets = args->offsets;
2603                UChar *target = args->target;
2604                int32_t delta = (int32_t)(args->source - sourceStart);
2605                while(target < subArgs.target) {
2606                    if(*offsets >= 0) {
2607                        *offsets += delta;
2608                    }
2609                    ++offsets;
2610                    ++target;
2611                }
2612            }
2613            args->source = subArgs.source;
2614            args->target = subArgs.target;
2615            args->offsets = subArgs.offsets;
2616
2617            /* copy input/error/overflow buffers */
2618            if(subArgs.converter->toULength > 0) {
2619                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2620            }
2621            args->converter->toULength = subArgs.converter->toULength;
2622
2623            if(*err == U_BUFFER_OVERFLOW_ERROR) {
2624                if(subArgs.converter->UCharErrorBufferLength > 0) {
2625                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2626                                subArgs.converter->UCharErrorBufferLength);
2627                }
2628                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2629                subArgs.converter->UCharErrorBufferLength = 0;
2630            }
2631        }
2632
2633        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2634            return;
2635        }
2636
2637escape:
2638        changeState_2022(args->converter,
2639               &(args->source),
2640               args->sourceLimit,
2641               ISO_2022_KR,
2642               err);
2643    }
2644}
2645
2646static void
2647UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2648                                                            UErrorCode* err){
2649    char tempBuf[2];
2650    const char *mySource = ( char *) args->source;
2651    UChar *myTarget = args->target;
2652    const char *mySourceLimit = args->sourceLimit;
2653    UChar32 targetUniChar = 0x0000;
2654    UChar mySourceChar = 0x0000;
2655    UConverterDataISO2022* myData;
2656    UConverterSharedData* sharedData ;
2657    UBool useFallback;
2658
2659    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2660    if(myData->version==1){
2661        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2662        return;
2663    }
2664
2665    /* initialize state */
2666    sharedData = myData->currentConverter->sharedData;
2667    useFallback = args->converter->useFallback;
2668
2669    if(myData->key != 0) {
2670        /* continue with a partial escape sequence */
2671        goto escape;
2672    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2673        /* continue with a partial double-byte character */
2674        mySourceChar = args->converter->toUBytes[0];
2675        args->converter->toULength = 0;
2676        goto getTrailByte;
2677    }
2678
2679    while(mySource< mySourceLimit){
2680
2681        if(myTarget < args->targetLimit){
2682
2683            mySourceChar= (unsigned char) *mySource++;
2684
2685            if(mySourceChar==UCNV_SI){
2686                myData->toU2022State.g = 0;
2687                if (myData->isEmptySegment) {
2688                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
2689                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2690                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
2691                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2692                    args->converter->toULength = 1;
2693                    args->target = myTarget;
2694                    args->source = mySource;
2695                    return;
2696                }
2697                /*consume the source */
2698                continue;
2699            }else if(mySourceChar==UCNV_SO){
2700                myData->toU2022State.g = 1;
2701                myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
2702                /*consume the source */
2703                continue;
2704            }else if(mySourceChar==ESC_2022){
2705                mySource--;
2706escape:
2707                myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */
2708                changeState_2022(args->converter,&(mySource),
2709                                mySourceLimit, ISO_2022_KR, err);
2710                if(U_FAILURE(*err)){
2711                    args->target = myTarget;
2712                    args->source = mySource;
2713                    return;
2714                }
2715                continue;
2716            }
2717
2718            myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
2719            if(myData->toU2022State.g == 1) {
2720                if(mySource < mySourceLimit) {
2721                    int leadIsOk, trailIsOk;
2722                    uint8_t trailByte;
2723getTrailByte:
2724                    targetUniChar = missingCharMarker;
2725                    trailByte = (uint8_t)*mySource;
2726                    /*
2727                     * Ticket 5691: consistent illegal sequences:
2728                     * - We include at least the first byte in the illegal sequence.
2729                     * - If any of the non-initial bytes could be the start of a character,
2730                     *   we stop the illegal sequence before the first one of those.
2731                     *
2732                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2733                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2734                     * Otherwise we convert or report the pair of bytes.
2735                     */
2736                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2737                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2738                    if (leadIsOk && trailIsOk) {
2739                        ++mySource;
2740                        tempBuf[0] = (char)(mySourceChar + 0x80);
2741                        tempBuf[1] = (char)(trailByte + 0x80);
2742                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2743                        mySourceChar = (mySourceChar << 8) | trailByte;
2744                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2745                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2746                        ++mySource;
2747                        /* add another bit so that the code below writes 2 bytes in case of error */
2748                        mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2749                    }
2750                } else {
2751                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2752                    args->converter->toULength = 1;
2753                    break;
2754                }
2755            }
2756            else if(mySourceChar <= 0x7f) {
2757                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2758            } else {
2759                targetUniChar = 0xffff;
2760            }
2761            if(targetUniChar < 0xfffe){
2762                if(args->offsets) {
2763                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2764                }
2765                *(myTarget++)=(UChar)targetUniChar;
2766            }
2767            else {
2768                /* Call the callback function*/
2769                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2770                break;
2771            }
2772        }
2773        else{
2774            *err =U_BUFFER_OVERFLOW_ERROR;
2775            break;
2776        }
2777    }
2778    args->target = myTarget;
2779    args->source = mySource;
2780}
2781
2782/*************************** END ISO2022-KR *********************************/
2783
2784/*************************** ISO-2022-CN *********************************
2785*
2786* Rules for ISO-2022-CN Encoding:
2787* i)   The designator sequence must appear once on a line before any instance
2788*      of character set it designates.
2789* ii)  If two lines contain characters from the same character set, both lines
2790*      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence has to be found
*      to invoke the designated character set
2793* iv)  All lines start in ASCII and end in ASCII.
2794* v)   Four shifting sequences are employed for this purpose:
2795*
*      Sequence    ASCII Eq    Charsets
2797*      ----------  -------    ---------
2798*      SI           <SI>        US-ASCII
2799*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2800*      SS2          <ESC>N      CNS-11643-1992 Plane 2
2801*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2802*
2803* vi)
2804*      SOdesignator  : ESC "$" ")" finalchar_for_SO
2805*      SS2designator : ESC "$" "*" finalchar_for_SS2
2806*      SS3designator : ESC "$" "+" finalchar_for_SS3
2807*
2808*      ESC $ ) A       Indicates the bytes following SO are Chinese
2809*       characters as defined in GB 2312-80, until
2810*       another SOdesignation appears
2811*
2812*
2813*      ESC $ ) E       Indicates the bytes following SO are as defined
2814*       in ISO-IR-165 (for details, see section 2.1),
2815*       until another SOdesignation appears
2816*
2817*      ESC $ ) G       Indicates the bytes following SO are as defined
2818*       in CNS 11643-plane-1, until another
2819*       SOdesignation appears
2820*
2821*      ESC $ * H       Indicates the two bytes immediately following
2822*       SS2 is a Chinese character as defined in CNS
2823*       11643-plane-2, until another SS2designation
2824*       appears
*       (Meaning <ESC>N must precede every 2-byte
*        sequence.)
2827*
2828*      ESC $ + I       Indicates the immediate two bytes following SS3
2829*       is a Chinese character as defined in CNS
2830*       11643-plane-3, until another SS3designation
2831*       appears
*       (Meaning <ESC>O must precede every 2-byte
*        sequence.)
2834*
2835*      ESC $ + J       Indicates the immediate two bytes following SS3
2836*       is a Chinese character as defined in CNS
2837*       11643-plane-4, until another SS3designation
2838*       appears
*       (In English: <ESC>O must precede every 2-byte
*        sequence.)
2841*
2842*      ESC $ + K       Indicates the immediate two bytes following SS3
2843*       is a Chinese character as defined in CNS
2844*       11643-plane-5, until another SS3designation
2845*       appears
2846*
2847*      ESC $ + L       Indicates the immediate two bytes following SS3
2848*       is a Chinese character as defined in CNS
2849*       11643-plane-6, until another SS3designation
2850*       appears
2851*
2852*      ESC $ + M       Indicates the immediate two bytes following SS3
2853*       is a Chinese character as defined in CNS
2854*       11643-plane-7, until another SS3designation
2855*       appears
2856*
2857*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2858*       has its own designation information before any Chinese characters
2859*       appear
2860*
2861*/
2862
/*
 * Designator escape sequences for ISO-2022-CN.
 * The following are defined this way (as const char arrays) to make the strings truly readonly.
 * Each sequence is ESC (0x1B), '$' (0x24), one of ')' (0x29), '*' (0x2A) or '+' (0x2B),
 * and a final letter.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";              /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";              /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";  /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";  /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";  /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";  /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";  /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";  /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";  /* ESC $ + M */

/********************** ISO2022-CN Data **************************/
/*
 * Table of the ISO-2022-CN sequences above; entry 0 is SHIFT_IN_STR for ASCII.
 * NOTE(review): the indices presumably correspond to the ISO-2022-CN character
 * set enum values used by the converter - confirm against that enum.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643-1992 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643-1992 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643-1992 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643-1992 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643-1992 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643-1992 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643-1992 plane 7 */
};
2887
2888static void
2889UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2890    UConverter *cnv = args->converter;
2891    UConverterDataISO2022 *converterData;
2892    ISO2022State *pFromU2022State;
2893    uint8_t *target = (uint8_t *) args->target;
2894    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2895    const UChar* source = args->source;
2896    const UChar* sourceLimit = args->sourceLimit;
2897    int32_t* offsets = args->offsets;
2898    UChar32 sourceChar;
2899    char buffer[8];
2900    int32_t len;
2901    int8_t choices[3];
2902    int32_t choiceCount;
2903    uint32_t targetValue = 0;
2904    UBool useFallback;
2905
2906    /* set up the state */
2907    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2908    pFromU2022State   = &converterData->fromU2022State;
2909
2910    choiceCount = 0;
2911
2912    /* check if the last codepoint of previous buffer was a lead surrogate*/
2913    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2914        goto getTrail;
2915    }
2916
2917    while( source < sourceLimit){
2918        if(target < targetLimit){
2919
2920            sourceChar  = *(source++);
2921            /*check if the char is a First surrogate*/
2922             if(U16_IS_SURROGATE(sourceChar)) {
2923                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2924getTrail:
2925                    /*look ahead to find the trail surrogate*/
2926                    if(source < sourceLimit) {
2927                        /* test the following code unit */
2928                        UChar trail=(UChar) *source;
2929                        if(U16_IS_TRAIL(trail)) {
2930                            source++;
2931                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2932                            cnv->fromUChar32=0x00;
2933                            /* convert this supplementary code point */
2934                            /* exit this condition tree */
2935                        } else {
2936                            /* this is an unmatched lead code unit (1st surrogate) */
2937                            /* callback(illegal) */
2938                            *err=U_ILLEGAL_CHAR_FOUND;
2939                            cnv->fromUChar32=sourceChar;
2940                            break;
2941                        }
2942                    } else {
2943                        /* no more input */
2944                        cnv->fromUChar32=sourceChar;
2945                        break;
2946                    }
2947                } else {
2948                    /* this is an unmatched trail code unit (2nd surrogate) */
2949                    /* callback(illegal) */
2950                    *err=U_ILLEGAL_CHAR_FOUND;
2951                    cnv->fromUChar32=sourceChar;
2952                    break;
2953                }
2954            }
2955
2956            /* do the conversion */
2957            if(sourceChar <= 0x007f ){
2958                /* do not convert SO/SI/ESC */
2959                if(IS_2022_CONTROL(sourceChar)) {
2960                    /* callback(illegal) */
2961                    *err=U_ILLEGAL_CHAR_FOUND;
2962                    cnv->fromUChar32=sourceChar;
2963                    break;
2964                }
2965
2966                /* US-ASCII */
2967                if(pFromU2022State->g == 0) {
2968                    buffer[0] = (char)sourceChar;
2969                    len = 1;
2970                } else {
2971                    buffer[0] = UCNV_SI;
2972                    buffer[1] = (char)sourceChar;
2973                    len = 2;
2974                    pFromU2022State->g = 0;
2975                    choiceCount = 0;
2976                }
2977                if(sourceChar == CR || sourceChar == LF) {
2978                    /* reset the state at the end of a line */
2979                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2980                    choiceCount = 0;
2981                }
2982            }
2983            else{
2984                /* convert U+0080..U+10ffff */
2985                int32_t i;
2986                int8_t cs, g;
2987
2988                if(choiceCount == 0) {
2989                    /* try the current SO/G1 converter first */
2990                    choices[0] = pFromU2022State->cs[1];
2991
2992                    /* default to GB2312_1 if none is designated yet */
2993                    if(choices[0] == 0) {
2994                        choices[0] = GB2312_1;
2995                    }
2996
2997                    if(converterData->version == 0) {
2998                        /* ISO-2022-CN */
2999
3000                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3001                        if(choices[0] == GB2312_1) {
3002                            choices[1] = (int8_t)CNS_11643_1;
3003                        } else {
3004                            choices[1] = (int8_t)GB2312_1;
3005                        }
3006
3007                        choiceCount = 2;
3008                    } else if (converterData->version == 1) {
3009                        /* ISO-2022-CN-EXT */
3010
3011                        /* try one of the other converters */
3012                        switch(choices[0]) {
3013                        case GB2312_1:
3014                            choices[1] = (int8_t)CNS_11643_1;
3015                            choices[2] = (int8_t)ISO_IR_165;
3016                            break;
3017                        case ISO_IR_165:
3018                            choices[1] = (int8_t)GB2312_1;
3019                            choices[2] = (int8_t)CNS_11643_1;
3020                            break;
3021                        default: /* CNS_11643_x */
3022                            choices[1] = (int8_t)GB2312_1;
3023                            choices[2] = (int8_t)ISO_IR_165;
3024                            break;
3025                        }
3026
3027                        choiceCount = 3;
3028                    } else {
3029                        choices[0] = (int8_t)CNS_11643_1;
3030                        choices[1] = (int8_t)GB2312_1;
3031                    }
3032                }
3033
3034                cs = g = 0;
3035                /*
3036                 * len==0: no mapping found yet
3037                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3038                 * len>0: found a roundtrip result, done
3039                 */
3040                len = 0;
3041                /*
3042                 * We will turn off useFallback after finding a fallback,
3043                 * but we still get fallbacks from PUA code points as usual.
3044                 * Therefore, we will also need to check that we don't overwrite
3045                 * an early fallback with a later one.
3046                 */
3047                useFallback = cnv->useFallback;
3048
3049                for(i = 0; i < choiceCount && len <= 0; ++i) {
3050                    int8_t cs0 = choices[i];
3051                    if(cs0 > 0) {
3052                        uint32_t value;
3053                        int32_t len2;
3054                        if(cs0 >= CNS_11643_0) {
3055                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3056                                        converterData->myConverterArray[CNS_11643],
3057                                        sourceChar,
3058                                        &value,
3059                                        useFallback,
3060                                        MBCS_OUTPUT_3);
3061                            if(len2 == 3 || (len2 == -3 && len == 0)) {
3062                                targetValue = value;
3063                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3064                                if(len2 >= 0) {
3065                                    len = 2;
3066                                } else {
3067                                    len = -2;
3068                                    useFallback = FALSE;
3069                                }
3070                                if(cs == CNS_11643_1) {
3071                                    g = 1;
3072                                } else if(cs == CNS_11643_2) {
3073                                    g = 2;
3074                                } else /* plane 3..7 */ if(converterData->version == 1) {
3075                                    g = 3;
3076                                } else {
3077                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3078                                    len = 0;
3079                                }
3080                            }
3081                        } else {
3082                            /* GB2312_1 or ISO-IR-165 */
3083                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3084                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3085                                        converterData->myConverterArray[cs0],
3086                                        sourceChar,
3087                                        &value,
3088                                        useFallback,
3089                                        MBCS_OUTPUT_2);
3090                            if(len2 == 2 || (len2 == -2 && len == 0)) {
3091                                targetValue = value;
3092                                len = len2;
3093                                cs = cs0;
3094                                g = 1;
3095                                useFallback = FALSE;
3096                            }
3097                        }
3098                    }
3099                }
3100
3101                if(len != 0) {
3102                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
3103
3104                    /* write the designation sequence if necessary */
3105                    if(cs != pFromU2022State->cs[g]) {
3106                        if(cs < CNS_11643) {
3107                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3108                        } else {
3109                            U_ASSERT(cs >= CNS_11643_1);
3110                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3111                        }
3112                        len = 4;
3113                        pFromU2022State->cs[g] = cs;
3114                        if(g == 1) {
3115                            /* changing the SO/G1 charset invalidates the choices[] */
3116                            choiceCount = 0;
3117                        }
3118                    }
3119
3120                    /* write the shift sequence if necessary */
3121                    if(g != pFromU2022State->g) {
3122                        switch(g) {
3123                        case 1:
3124                            buffer[len++] = UCNV_SO;
3125
3126                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3127                            pFromU2022State->g = 1;
3128                            break;
3129                        case 2:
3130                            buffer[len++] = 0x1b;
3131                            buffer[len++] = 0x4e;
3132                            break;
3133                        default: /* case 3 */
3134                            buffer[len++] = 0x1b;
3135                            buffer[len++] = 0x4f;
3136                            break;
3137                        }
3138                    }
3139
3140                    /* write the two output bytes */
3141                    buffer[len++] = (char)(targetValue >> 8);
3142                    buffer[len++] = (char)targetValue;
3143                } else {
3144                    /* if we cannot find the character after checking all codepages
3145                     * then this is an error
3146                     */
3147                    *err = U_INVALID_CHAR_FOUND;
3148                    cnv->fromUChar32=sourceChar;
3149                    break;
3150                }
3151            }
3152
3153            /* output len>0 bytes in buffer[] */
3154            if(len == 1) {
3155                *target++ = buffer[0];
3156                if(offsets) {
3157                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3158                }
3159            } else if(len == 2 && (target + 2) <= targetLimit) {
3160                *target++ = buffer[0];
3161                *target++ = buffer[1];
3162                if(offsets) {
3163                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3164                    *offsets++ = sourceIndex;
3165                    *offsets++ = sourceIndex;
3166                }
3167            } else {
3168                fromUWriteUInt8(
3169                    cnv,
3170                    buffer, len,
3171                    &target, (const char *)targetLimit,
3172                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3173                    err);
3174                if(U_FAILURE(*err)) {
3175                    break;
3176                }
3177            }
3178        } /* end if(myTargetIndex<myTargetLength) */
3179        else{
3180            *err =U_BUFFER_OVERFLOW_ERROR;
3181            break;
3182        }
3183
3184    }/* end while(mySourceIndex<mySourceLength) */
3185
3186    /*
3187     * the end of the input stream and detection of truncated input
3188     * are handled by the framework, but for ISO-2022-CN conversion
3189     * we need to be in ASCII mode at the very end
3190     *
3191     * conditions:
3192     *   successful
3193     *   not in ASCII mode
3194     *   end of input and no truncated input
3195     */
3196    if( U_SUCCESS(*err) &&
3197        pFromU2022State->g!=0 &&
3198        args->flush && source>=sourceLimit && cnv->fromUChar32==0
3199    ) {
3200        int32_t sourceIndex;
3201
3202        /* we are switching to ASCII */
3203        pFromU2022State->g=0;
3204
3205        /* get the source index of the last input character */
3206        /*
3207         * TODO this would be simpler and more reliable if we used a pair
3208         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3209         * so that we could simply use the prevSourceIndex here;
3210         * this code gives an incorrect result for the rare case of an unmatched
3211         * trail surrogate that is alone in the last buffer of the text stream
3212         */
3213        sourceIndex=(int32_t)(source-args->source);
3214        if(sourceIndex>0) {
3215            --sourceIndex;
3216            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3217                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3218            ) {
3219                --sourceIndex;
3220            }
3221        } else {
3222            sourceIndex=-1;
3223        }
3224
3225        fromUWriteUInt8(
3226            cnv,
3227            SHIFT_IN_STR, 1,
3228            &target, (const char *)targetLimit,
3229            &offsets, sourceIndex,
3230            err);
3231    }
3232
3233    /*save the state and return */
3234    args->source = source;
3235    args->target = (char*)target;
3236}
3237
3238
/*
 * Converts ISO-2022-CN bytes to UTF-16, optionally recording offsets.
 *
 * Input is read from args->source up to args->sourceLimit; output is written
 * to args->target up to args->targetLimit. If args->offsets is not NULL, the
 * source index of each output code unit is stored at the matching position.
 * On return args->source and args->target are advanced.
 *
 * The shift/designation state is kept in myData->toU2022State:
 *   SI selects g=0, SO selects g=1 (only after a G1 designation),
 *   ESC sequences are handed to changeState_2022(), and CR/LF reset the state.
 * With g==0 bytes up to 0x7f are copied as is; otherwise two bytes are
 * combined and looked up in the converter selected by cs[g].
 *
 * State carried between calls:
 *   myData->key != 0                  a partial escape sequence is pending
 *   args->converter->toULength == 1   the lead byte of a double-byte character
 *                                     is stored in toUBytes[0]
 *
 * An SO that is followed directly by SI or by a designator, without any
 * character in between (isEmptySegment), is reported as
 * U_ILLEGAL_ESCAPE_SEQUENCE. Unconvertible input is passed to
 * toUnicodeCallback(); a full target sets U_BUFFER_OVERFLOW_ERROR.
 */
static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    /* bytes handed to the MBCS lookup: 2 for GB/ISO-IR-165, 3 (plane byte + 2) for CNS */
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* assume "no mapping" until a conversion below succeeds */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                    /* leaves the switch with targetUniChar==missingCharMarker, so the callback below is invoked */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                {
                    /* remember where we were so that the length of an irregular sequence can be computed */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* a line end resets all shift and designation state; the byte itself is then converted as ASCII */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        /* peek at the second byte; it is consumed only in the branches below */
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS 11643: prefix the two bytes with 0x80 + plane number for the 3-byte lookup */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            /* keep both bytes; a value above 0xff also tells the offsets code that 2 bytes were consumed */
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* the trail byte is not available yet: save the lead byte for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* ASCII mode: bytes above 0x7f keep targetUniChar==missingCharMarker */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one code unit */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* publish how far we got in both buffers */
    args->target = myTarget;
    args->source = mySource;
}
3440
3441static void
3442_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3443    UConverter *cnv = args->converter;
3444    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3445    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3446    char *p, *subchar;
3447    char buffer[8];
3448    int32_t length;
3449
3450    subchar=(char *)cnv->subChars;
3451    length=cnv->subCharLen; /* assume length==1 for most variants */
3452
3453    p = buffer;
3454    switch(myConverterData->locale[0]){
3455    case 'j':
3456        {
3457            int8_t cs;
3458
3459            if(pFromU2022State->g == 1) {
3460                /* JIS7: switch from G1 to G0 */
3461                pFromU2022State->g = 0;
3462                *p++ = UCNV_SI;
3463            }
3464
3465            cs = pFromU2022State->cs[0];
3466            if(cs != ASCII && cs != JISX201) {
3467                /* not in ASCII or JIS X 0201: switch to ASCII */
3468                pFromU2022State->cs[0] = (int8_t)ASCII;
3469                *p++ = '\x1b';
3470                *p++ = '\x28';
3471                *p++ = '\x42';
3472            }
3473
3474            *p++ = subchar[0];
3475            break;
3476        }
3477    case 'c':
3478        if(pFromU2022State->g != 0) {
3479            /* not in ASCII mode: switch to ASCII */
3480            pFromU2022State->g = 0;
3481            *p++ = UCNV_SI;
3482        }
3483        *p++ = subchar[0];
3484        break;
3485    case 'k':
3486        if(myConverterData->version == 0) {
3487            if(length == 1) {
3488                if((UBool)args->converter->fromUnicodeStatus) {
3489                    /* in DBCS mode: switch to SBCS */
3490                    args->converter->fromUnicodeStatus = 0;
3491                    *p++ = UCNV_SI;
3492                }
3493                *p++ = subchar[0];
3494            } else /* length == 2*/ {
3495                if(!(UBool)args->converter->fromUnicodeStatus) {
3496                    /* in SBCS mode: switch to DBCS */
3497                    args->converter->fromUnicodeStatus = 1;
3498                    *p++ = UCNV_SO;
3499                }
3500                *p++ = subchar[0];
3501                *p++ = subchar[1];
3502            }
3503            break;
3504        } else {
3505            /* save the subconverter's substitution string */
3506            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3507            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3508
3509            /* set our substitution string into the subconverter */
3510            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3511            myConverterData->currentConverter->subCharLen = (int8_t)length;
3512
3513            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3514            args->converter = myConverterData->currentConverter;
3515            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3516            ucnv_cbFromUWriteSub(args, 0, err);
3517            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3518            args->converter = cnv;
3519
3520            /* restore the subconverter's substitution string */
3521            myConverterData->currentConverter->subChars = currentSubChars;
3522            myConverterData->currentConverter->subCharLen = currentSubCharLen;
3523
3524            if(*err == U_BUFFER_OVERFLOW_ERROR) {
3525                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3526                    uprv_memcpy(
3527                        cnv->charErrorBuffer,
3528                        myConverterData->currentConverter->charErrorBuffer,
3529                        myConverterData->currentConverter->charErrorBufferLength);
3530                }
3531                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3532                myConverterData->currentConverter->charErrorBufferLength = 0;
3533            }
3534            return;
3535        }
3536    default:
3537        /* not expected */
3538        break;
3539    }
3540    ucnv_cbFromUWriteBytes(args,
3541                           buffer, (int32_t)(p - buffer),
3542                           offsetIndex, err);
3543}
3544
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 */
struct cloneStruct
{
    /* the cloned ISO 2022 converter itself; its address is what _ISO_2022_SafeClone() returns */
    UConverter cnv;
    /* storage for the clone of the sub-converter (UConverterDataISO2022.currentConverter) */
    UConverter currentConverter;
    /* padding so that the sub-converter clone may be shifted for alignment */
    UAlignedMemory deadSpace;
    /* copy of the ISO 2022 extra data; cnv.extraInfo is pointed here */
    UConverterDataISO2022 mydata;
};
3563
3564
3565static UConverter *
3566_ISO_2022_SafeClone(
3567            const UConverter *cnv,
3568            void *stackBuffer,
3569            int32_t *pBufferSize,
3570            UErrorCode *status)
3571{
3572    struct cloneStruct * localClone;
3573    UConverterDataISO2022 *cnvData;
3574    int32_t i, size;
3575
3576    if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3577        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3578        return NULL;
3579    }
3580
3581    cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3582    localClone = (struct cloneStruct *)stackBuffer;
3583
3584    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3585
3586    uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3587    localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3588    localClone->cnv.isExtraLocal = TRUE;
3589
3590    /* share the subconverters */
3591
3592    if(cnvData->currentConverter != NULL) {
3593        size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3594        localClone->mydata.currentConverter =
3595            ucnv_safeClone(cnvData->currentConverter,
3596                            &localClone->currentConverter,
3597                            &size, status);
3598        if(U_FAILURE(*status)) {
3599            return NULL;
3600        }
3601    }
3602
3603    for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3604        if(cnvData->myConverterArray[i] != NULL) {
3605            ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3606        }
3607    }
3608
3609    return &localClone->cnv;
3610}
3611
3612static void
3613_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3614                    const USetAdder *sa,
3615                    UConverterUnicodeSet which,
3616                    UErrorCode *pErrorCode)
3617{
3618    int32_t i;
3619    UConverterDataISO2022* cnvData;
3620
3621    if (U_FAILURE(*pErrorCode)) {
3622        return;
3623    }
3624#ifdef U_ENABLE_GENERIC_ISO_2022
3625    if (cnv->sharedData == &_ISO2022Data) {
3626        /* We use UTF-8 in this case */
3627        sa->addRange(sa->set, 0, 0xd7FF);
3628        sa->addRange(sa->set, 0xE000, 0x10FFFF);
3629        return;
3630    }
3631#endif
3632
3633    cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3634
3635    /* open a set and initialize it with code points that are algorithmically round-tripped */
3636    switch(cnvData->locale[0]){
3637    case 'j':
3638        /* include JIS X 0201 which is hardcoded */
3639        sa->add(sa->set, 0xa5);
3640        sa->add(sa->set, 0x203e);
3641        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3642            /* include Latin-1 for some variants of JP */
3643            sa->addRange(sa->set, 0, 0xff);
3644        } else {
3645            /* include ASCII for JP */
3646            sa->addRange(sa->set, 0, 0x7f);
3647        }
3648        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3649            /*
3650             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3651             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3652             * use half-width Katakana.
3653             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3654             * half-width Katakana via the ESC ( I sequence.
3655             * However, we only emit (fromUnicode) half-width Katakana according to the
3656             * definition of each variant.
3657             *
3658             * When including fallbacks,
3659             * we need to include half-width Katakana Unicode code points for all JP variants because
3660             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3661             */
3662            /* include half-width Katakana for JP */
3663            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3664        }
3665        break;
3666    case 'c':
3667    case 'z':
3668        /* include ASCII for CN */
3669        sa->addRange(sa->set, 0, 0x7f);
3670        break;
3671    case 'k':
3672        /* there is only one converter for KR, and it is not in the myConverterArray[] */
3673        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3674                cnvData->currentConverter, sa, which, pErrorCode);
3675        /* the loop over myConverterArray[] will simply not find another converter */
3676        break;
3677    default:
3678        break;
3679    }
3680
3681#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3682            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3683                cnvData->version==0 && i==CNS_11643
3684            ) {
3685                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3686                ucnv_MBCSGetUnicodeSetForBytes(
3687                        cnvData->myConverterArray[i],
3688                        sa, UCNV_ROUNDTRIP_SET,
3689                        0, 0x81, 0x82,
3690                        pErrorCode);
3691            }
3692#endif
3693
3694    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3695        UConverterSetFilter filter;
3696        if(cnvData->myConverterArray[i]!=NULL) {
3697            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3698                cnvData->version==0 && i==CNS_11643
3699            ) {
3700                /*
3701                 * Version-specific for CN:
3702                 * CN version 0 does not map CNS planes 3..7 although
3703                 * they are all available in the CNS conversion table;
3704                 * CN version 1 (-EXT) does map them all.
3705                 * The two versions create different Unicode sets.
3706                 */
3707                filter=UCNV_SET_FILTER_2022_CN;
3708            } else if(cnvData->locale[0]=='j' && i==JISX208) {
3709                /*
3710                 * Only add code points that map to Shift-JIS codes
3711                 * corresponding to JIS X 0208.
3712                 */
3713                filter=UCNV_SET_FILTER_SJIS;
3714            } else if(i==KSC5601) {
3715                /*
3716                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3717                 * are broader than GR94.
3718                 */
3719                filter=UCNV_SET_FILTER_GR94DBCS;
3720            } else {
3721                filter=UCNV_SET_FILTER_NONE;
3722            }
3723            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3724        }
3725    }
3726
3727    /*
3728     * ISO 2022 converters must not convert SO/SI/ESC despite what
3729     * sub-converters do by themselves.
3730     * Remove these characters from the set.
3731     */
3732    sa->remove(sa->set, 0x0e);
3733    sa->remove(sa->set, 0x0f);
3734    sa->remove(sa->set, 0x1b);
3735
3736    /* ISO 2022 converters do not convert C1 controls either */
3737    sa->removeRange(sa->set, 0x80, 0x9f);
3738}
3739
/*
 * UConverterImpl table for the generic "ISO_2022" converter.
 * The four conversion function entries are filled in only when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 * NOTE(review): the slot labels below assume the member order of
 * UConverterImpl, which is declared outside this file -- confirm against it.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022, /* type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open, /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

#ifdef U_ENABLE_GENERIC_ISO_2022
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* toUnicode */
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
    ucnv_fromUnicode_UTF8, /* fromUnicode */
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
#else
    NULL, /* toUnicode */
    NULL, /* toUnicodeWithOffsets */
    NULL, /* fromUnicode */
    NULL, /* fromUnicodeWithOffsets */
#endif
    NULL, /* getNextUChar */

    NULL, /* getStarters */
    _ISO2022getName, /* getName */
    _ISO_2022_WriteSub, /* writeSub */
    _ISO_2022_SafeClone, /* safeClone */
    _ISO_2022_GetUnicodeSet, /* getUnicodeSet */

    NULL, /* toUTF8 */
    NULL /* fromUTF8 */
};
/*
 * Static (constant) description of the generic "ISO_2022" converter.
 * NOTE(review): the field labels below assume the member order of
 * UConverterStaticData, which is declared outside this file -- confirm against it.
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData), /* structSize */
    "ISO_2022", /* name */
    2022, /* codepage */
    UCNV_IBM, /* platform */
    UCNV_ISO_2022, /* conversionType */
    1, /* minBytesPerChar */
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 }, /* subChar */
    1, /* subCharLen */
    FALSE, /* hasToUnicodeFallback */
    FALSE, /* hasFromUnicodeFallback */
    0, /* unicodeMask */
    0, /* subChar1 */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared-data record for the generic "ISO_2022" converter; it ties
 * _ISO2022StaticData to the _ISO2022Impl function table.
 * Unlike the JP/KR/CN records it has external linkage; it is compared against
 * cnv->sharedData in _ISO_2022_GetUnicodeSet() when U_ENABLE_GENERIC_ISO_2022 is defined.
 * NOTE(review): the field labels below assume the member order of
 * UConverterSharedData, which is declared outside this file -- confirm against it.
 */
const UConverterSharedData _ISO2022Data={
    sizeof(UConverterSharedData), /* structSize */
    ~((uint32_t) 0), /* referenceCounter: all bits set */
    NULL, /* dataMemory */
    NULL, /* table */
    &_ISO2022StaticData, /* staticData */
    FALSE, /* sharedDataCached */
    &_ISO2022Impl, /* impl */
    0, UCNV_MBCS_TABLE_INITIALIZER /* toUnicodeStatus, mbcs */
};
3798
3799/*************JP****************/
/*
 * UConverterImpl table for ISO-2022-JP.
 * Open/close/reset, getName, writeSub, safeClone and getUnicodeSet are the
 * functions shared by all ISO-2022 tables in this file; only the conversion
 * functions are JP-specific, and each *_OFFSETS_LOGIC function is installed
 * in two consecutive entries.
 * NOTE(review): the slot labels below assume the member order of
 * UConverterImpl, which is declared outside this file -- confirm against it.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022, /* type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open, /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* toUnicode */
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicode */
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
    NULL, /* getNextUChar */

    NULL, /* getStarters */
    _ISO2022getName, /* getName */
    _ISO_2022_WriteSub, /* writeSub */
    _ISO_2022_SafeClone, /* safeClone */
    _ISO_2022_GetUnicodeSet, /* getUnicodeSet */

    NULL, /* toUTF8 */
    NULL /* fromUTF8 */
};
/*
 * Static (constant) description of the "ISO_2022_JP" converter.
 * NOTE(review): the field labels below assume the member order of
 * UConverterStaticData, which is declared outside this file -- confirm against it.
 */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData), /* structSize */
    "ISO_2022_JP", /* name */
    0, /* codepage */
    UCNV_IBM, /* platform */
    UCNV_ISO_2022, /* conversionType */
    1, /* minBytesPerChar */
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 }, /* subChar */
    1, /* subCharLen */
    FALSE, /* hasToUnicodeFallback */
    FALSE, /* hasFromUnicodeFallback */
    0, /* unicodeMask */
    0, /* subChar1 */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3841
namespace {

/*
 * Shared-data record for ISO-2022-JP; it ties _ISO2022JPStaticData to the
 * _ISO2022JPImpl function table. The unnamed namespace keeps it local to this file.
 * NOTE(review): the field labels below assume the member order of
 * UConverterSharedData, which is declared outside this file -- confirm against it.
 */
const UConverterSharedData _ISO2022JPData={
    sizeof(UConverterSharedData), /* structSize */
    ~((uint32_t) 0), /* referenceCounter: all bits set */
    NULL, /* dataMemory */
    NULL, /* table */
    &_ISO2022JPStaticData, /* staticData */
    FALSE, /* sharedDataCached */
    &_ISO2022JPImpl, /* impl */
    0, UCNV_MBCS_TABLE_INITIALIZER /* toUnicodeStatus, mbcs */
};

}  // namespace
3856
3857/************* KR ***************/
/*
 * UConverterImpl table for ISO-2022-KR.
 * Open/close/reset, getName, writeSub, safeClone and getUnicodeSet are the
 * functions shared by all ISO-2022 tables in this file; only the conversion
 * functions are KR-specific, and each *_OFFSETS_LOGIC function is installed
 * in two consecutive entries.
 * NOTE(review): the slot labels below assume the member order of
 * UConverterImpl, which is declared outside this file -- confirm against it.
 */
static const UConverterImpl _ISO2022KRImpl={
    UCNV_ISO_2022, /* type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open, /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicode */
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicode */
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
    NULL, /* getNextUChar */

    NULL, /* getStarters */
    _ISO2022getName, /* getName */
    _ISO_2022_WriteSub, /* writeSub */
    _ISO_2022_SafeClone, /* safeClone */
    _ISO_2022_GetUnicodeSet, /* getUnicodeSet */

    NULL, /* toUTF8 */
    NULL /* fromUTF8 */
};
/*
 * Static (constant) description of the "ISO_2022_KR" converter.
 * NOTE(review): the field labels below assume the member order of
 * UConverterStaticData, which is declared outside this file -- confirm against it.
 */
static const UConverterStaticData _ISO2022KRStaticData={
    sizeof(UConverterStaticData), /* structSize */
    "ISO_2022_KR", /* name */
    0, /* codepage */
    UCNV_IBM, /* platform */
    UCNV_ISO_2022, /* conversionType */
    1, /* minBytesPerChar */
    3, /* max 3 bytes per UChar: SO+DBCS */
    { 0x1a, 0, 0, 0 }, /* subChar */
    1, /* subCharLen */
    FALSE, /* hasToUnicodeFallback */
    FALSE, /* hasFromUnicodeFallback */
    0, /* unicodeMask */
    0, /* subChar1 */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3899
namespace {

/*
 * Shared-data record for ISO-2022-KR; it ties _ISO2022KRStaticData to the
 * _ISO2022KRImpl function table. The unnamed namespace keeps it local to this file.
 * NOTE(review): the field labels below assume the member order of
 * UConverterSharedData, which is declared outside this file -- confirm against it.
 */
const UConverterSharedData _ISO2022KRData={
    sizeof(UConverterSharedData), /* structSize */
    ~((uint32_t) 0), /* referenceCounter: all bits set */
    NULL, /* dataMemory */
    NULL, /* table */
    &_ISO2022KRStaticData, /* staticData */
    FALSE, /* sharedDataCached */
    &_ISO2022KRImpl, /* impl */
    0, UCNV_MBCS_TABLE_INITIALIZER /* toUnicodeStatus, mbcs */
};

}  // namespace
3914
3915/*************** CN ***************/
/*
 * UConverterImpl table for ISO-2022-CN.
 * Open/close/reset, getName, writeSub, safeClone and getUnicodeSet are the
 * functions shared by all ISO-2022 tables in this file; only the conversion
 * functions are CN-specific, and each *_OFFSETS_LOGIC function is installed
 * in two consecutive entries.
 * NOTE(review): the slot labels below assume the member order of
 * UConverterImpl, which is declared outside this file -- confirm against it.
 */
static const UConverterImpl _ISO2022CNImpl={

    UCNV_ISO_2022, /* type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open, /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicode */
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicode */
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
    NULL, /* getNextUChar */

    NULL, /* getStarters */
    _ISO2022getName, /* getName */
    _ISO_2022_WriteSub, /* writeSub */
    _ISO_2022_SafeClone, /* safeClone */
    _ISO_2022_GetUnicodeSet, /* getUnicodeSet */

    NULL, /* toUTF8 */
    NULL /* fromUTF8 */
};
/*
 * Static (constant) description of the "ISO_2022_CN" converter.
 * NOTE(review): the field labels below assume the member order of
 * UConverterStaticData, which is declared outside this file -- confirm against it.
 */
static const UConverterStaticData _ISO2022CNStaticData={
    sizeof(UConverterStaticData), /* structSize */
    "ISO_2022_CN", /* name */
    0, /* codepage */
    UCNV_IBM, /* platform */
    UCNV_ISO_2022, /* conversionType */
    1, /* minBytesPerChar */
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
    { 0x1a, 0, 0, 0 }, /* subChar */
    1, /* subCharLen */
    FALSE, /* hasToUnicodeFallback */
    FALSE, /* hasFromUnicodeFallback */
    0, /* unicodeMask */
    0, /* subChar1 */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3958
namespace {

/*
 * Shared-data record for ISO-2022-CN; it ties _ISO2022CNStaticData to the
 * _ISO2022CNImpl function table. The unnamed namespace keeps it local to this file.
 * NOTE(review): the field labels below assume the member order of
 * UConverterSharedData, which is declared outside this file -- confirm against it.
 */
const UConverterSharedData _ISO2022CNData={
    sizeof(UConverterSharedData), /* structSize */
    ~((uint32_t) 0), /* referenceCounter: all bits set */
    NULL, /* dataMemory */
    NULL, /* table */
    &_ISO2022CNStaticData, /* staticData */
    FALSE, /* sharedDataCached */
    &_ISO2022CNImpl, /* impl */
    0, UCNV_MBCS_TABLE_INITIALIZER /* toUnicodeStatus, mbcs */
};

}  // namespace
3973
3974#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3975