/*
**********************************************************************
*   Copyright (C) 2000-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv2022.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000feb03
*   created by: Markus W. Scherer
*
*   Change history:
*
*   06/29/2000  helena  Major rewrite of the callback APIs.
*   08/08/2000  Ram     Included support for ISO-2022-JP-2
*                       Changed implementation of toUnicode
*                       function
*   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
*                       ucnvebdc.c
*   09/20/2000  Ram     Added support for ISO-2022-CN
*                       Added implementations for getNextUChar()
*                       for specific 2022 country variants.
*   10/31/2000  Ram     Implemented offsets logic functions
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION

#include "unicode/ucnv.h"
#include "unicode/uset.h"
#include "unicode/ucnv_err.h"
#include "unicode/ucnv_cb.h"
#include "ucnv_imp.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "ucnvmbcs.h"
#include "cstring.h"
#include "cmemory.h"

/*
 * Element count of a real array, converted to int32_t.
 * Only meaningful for arrays, not for pointers (the sizeof ratio would be wrong).
 */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * I am disabling the generic ISO-2022 converter after proposing to do so on
 * the icu mailing list two days ago.
 *
 * Reasons:
 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
 *    its designation sequences, single shifts with return to the previous state,
 *    switch-with-no-return to UTF-16BE or similar, etc.
 *    This is unlike the language-specific variants like ISO-2022-JP which
 *    require a much smaller repertoire of ISO-2022 features.
 *    These variants continue to be supported.
 * 2. I believe that no one is really using the generic ISO-2022 converter
 *    but rather always one of the language-specific variants.
 *    Note that ICU's generic ISO-2022 converter has always output one escape
 *    sequence followed by UTF-8 for the whole stream.
 * 3. Switching between subcharsets is extremely slow, because each time
 *    the previous converter is closed and a new one opened,
 *    without any kind of caching, least-recently-used list, etc.
 * 4. The code is currently buggy, and given the above it does not seem
 *    reasonable to spend the time on maintenance.
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
 *    This means, for example, that when ISO-8859-7 is designated, the following
 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
 *    The ICU ISO-2022 converter does not handle this - and has no information
 *    about which subconverter would have to be shifted vs. which is designed
 *    for 7-bit ISO-2022.
 *
 * Markus Scherer 2003-dec-03
 */
#endif

/* The SI (0x0F) and SO (0x0E) shift control bytes as one-byte C strings. */
static const char SHIFT_IN_STR[]  = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* Byte values of the whitespace/control characters named below. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20

/* Inclusive bounds of the code point range U+FF61..U+FF9F used for halfwidth Katakana. */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};

/*
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
 * as bytes 21..7E. (Subtract 0x80.)
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
 * as bytes 20..7F. (Subtract 0x80.)
 * Do not encode C1 control codes with native bytes 80..9F
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START=0xa1,
    GR94_END=0xfe,
    GR96_START=0xa0,
    GR96_END=0xff
};

/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * Note: the argument is evaluated twice, and the (c)<0x20 test is what keeps
 * the shift count below 32; do not pass an expression with side effects.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)

/*
 * for ISO-2022-JP and -CN implementations
 *
 * The JP and CN groups deliberately reuse the same small integer values
 * (for example ISO8859_1 and GB2312_1 are both 1): a converter instance is
 * either JP or CN, and the values index that instance's myConverterArray[].
 * SS3_STATE is implicitly SS2_STATE+1 (0x11), and CNS_11643_1..7 follow
 * CNS_11643_0 (0x20) consecutively.
 */
typedef enum  {
        /* shared values */
        INVALID_STATE=-1,
        ASCII = 0,

        SS2_STATE=0x10,
        SS3_STATE,

        /* JP */
        ISO8859_1 = 1 ,
        ISO8859_7 = 2 ,
        JISX201  = 3,
        JISX208 = 4,
        JISX212 = 5,
        GB2312  =6,
        KSC5601 =7,
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */

        /* CN */
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
        GB2312_1=1,
        ISO_IR_165=2,
        CNS_11643=3,

        /*
         * these are used in StateEnum and ISO2022State variables,
         * but CNS_11643 must be used to index into myConverterArray[]
         */
        CNS_11643_0=0x20,
        CNS_11643_1,
        CNS_11643_2,
        CNS_11643_3,
        CNS_11643_4,
        CNS_11643_5,
        CNS_11643_6,
        CNS_11643_7
} StateEnum;

/* is the StateEnum charset value for a DBCS charset? (JISX208..KSC5601, i.e. values 4..7) */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/*
 * Charset mask bit for a JP StateEnum value.
 * The result is a uint16_t-based shift, so cs must stay in 0..15;
 * the JP charsets used with it here are 0 (ASCII) through 8 (HWKANA_7BIT).
 */
#define CSM(cs) ((uint16_t)1<<(cs))

/*
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 *
 * Note: The converter uses some leniency:
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 *   all versions, not just JIS7 and JIS8.
 * - ICU does not distinguish between different versions of JIS X 0208.
 */
/* Highest supported "version=" option for locale=ja; jpCharsetMasks[] has one entry per version 0..MAX_JA_VERSION. */
enum { MAX_JA_VERSION=4 };
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
};

/* Kind of charset; the converter data's currentType field holds one of these. */
typedef enum {
        ASCII1=0,
        LATIN1,
        SBCS,
        DBCS,
        MBCS,
        HWKANA
}Cnv2022Type;

/* Designation/shift state of one conversion direction. */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;

/* The low 4 bits of the converter open options carry the "version=" number. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Capacity of UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10

/* Per-converter private data, stored in UConverter.extraInfo. */
typedef struct{
    /* cached shared data of the sub-charsets, indexed by StateEnum values; unused slots are NULL */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* delegate converter: opened for locale=ko, and by the generic variant when enabled */
    UConverter *currentConverter;
    Cnv2022Type currentType;
    /* separate designation/shift state for each direction */
    ISO2022State toU2022State, fromU2022State;
    /* accumulated key of a partially read escape sequence; 0 when none is pending */
    uint32_t key;
    /* "version=" option, after validation in _ISO2022Open() */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    UBool isEmptySegment;
    /* converter name such as "ISO_2022,locale=ja,version=0" */
    char name[30];
    /* "ja", "ko" or "cn"; left empty for the generic variant */
    char locale[3];
}UConverterDataISO2022;

/* Protos */
/* ISO-2022 ----------------------------------------------------------------- */

/*Forward declaration */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);

#define ESC_2022 0x1B /*ESC*/

/* Classification of the bytes read so far of an escape sequence. */
typedef enum
{
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;

/*
* The way these state transition arrays work is:
* ex : ESC$B is the sequence for JISX208
*      a) First Iteration: char is ESC
*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
*             int x = normalize_esq_chars_2022[27] which is equal to 1
*         ii) Search for this value in escSeqStateTable_Key_2022[]
*             value of x is stored at escSeqStateTable_Key_2022[0]
*        iii) Save this index as offset
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
*     b) Switch on this state and continue to next char
*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
*             which is normalize_esq_chars_2022[36] == 4
*         ii) x is currently 1(from above)
*               x<<=5 -- x is now 32
*               x+=normalize_esq_chars_2022[36]
*               now x is 36
*        iii) Search for this value in escSeqStateTable_Key_2022[]
*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
*     c) Switch on this state and continue to next char
*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
*            which is normalize_esq_chars_2022[66] == 9
*        ii) x is currently 36 (from above)
*            x<<=5 -- x is now 1152
*            x+=normalize_esq_chars_2022[66]
*            now x is 1161
*       iii) Search for this value in escSeqStateTable_Key_2022[]
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
*/


/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value to a small code (1..29) if the byte can occur in one of
 * the known escape sequences, or to 0 if it cannot. The codes are combined,
 * 5 bits per byte, into the keys of escSeqStateTable_Key_2022[].
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0      1       2       3       4      5       6        7       8       9           */

         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0
};

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * When the generic ISO-2022 converter is completely removed, not just disabled
 * per #ifdef, then the following state table and the associated tables that are
 * dimensioned with MAX_STATES_2022 should be trimmed.
 *
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 * the associated escape sequences starting with ESC ( B should be removed.
 * This includes the ones with key values 1097 and all of the ones above 1000000.
 *
 * For the latter, the tables can simply be truncated.
 * For the former, since the tables must be kept parallel, it is probably best
 * to simply duplicate an adjacent table cell, parallel in all tables.
 *
 * It may make sense to restructure the tables, especially by using small search
 * tables for the variants instead of indexing them parallel to the table here.
 */
#endif

/*
 * Number of entries in each of the parallel escape-sequence tables below
 * (key, result name, state value) and in the per-variant nextStateToUnicode
 * tables; the same index addresses the same escape sequence in all of them.
 */
#define MAX_STATES_2022 74
/*
 * Keys of all recognized escape-sequence prefixes, in strictly ascending
 * order because getKey_2022() binary-searches this table.
 * A key is built from the normalize_esq_chars_2022[] codes of the bytes
 * read so far: key = (previous key << 5) + code.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*   0           1           2           3           4           5           6           7           8           9           */

     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    ,35947631   ,35947635   ,35947636   ,35947638
};

#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Generic variant only: converter name to open for the escape sequence at
 * the same index of escSeqStateTable_Key_2022[]; NULL where no converter
 * name is associated with that index.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */

     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
};

#endif

/*
 * UCNV_TableStates_2022 value for the key at the same index of
 * escSeqStateTable_Key_2022[]. Only index 10 (key 1097, ESC ( B) is
 * VALID_MAYBE_TERMINAL_2022.
 */
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
};


/* Type def for refactoring changeState_2022 code*/
/* Selects which ISO-2022 variant's escape-sequence handling changeState_2022() applies. */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
    ISO_2022_KR=2,
    ISO_2022_CN=3
} Variant2022;

/*********** ISO 2022 Converter Protos ***********/
static void
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

static void
 _ISO2022Close(UConverter *converter);

static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

static const char*
_ISO2022getName(const UConverter* cnv);

static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

/*const UConverterSharedData _ISO2022Data;*/
/* Shared-data objects of the language-specific variants; declared here, not defined in this part of the file. */
static const UConverterSharedData _ISO2022JPData;
static const UConverterSharedData _ISO2022KRData;
static const UConverterSharedData _ISO2022CNData;

/*************** Converter implementations ******************/

/* The purpose of this function is to get around gcc compiler warnings.
*/ 404static U_INLINE void 405fromUWriteUInt8(UConverter *cnv, 406 const char *bytes, int32_t length, 407 uint8_t **target, const char *targetLimit, 408 int32_t **offsets, 409 int32_t sourceIndex, 410 UErrorCode *pErrorCode) 411{ 412 char *targetChars = (char *)*target; 413 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 414 offsets, sourceIndex, pErrorCode); 415 *target = (uint8_t*)targetChars; 416 417} 418 419static U_INLINE void 420setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){ 421 if(myConverterData->version == 1) { 422 UConverter *cnv = myConverterData->currentConverter; 423 424 cnv->toUnicodeStatus=0; /* offset */ 425 cnv->mode=0; /* state */ 426 cnv->toULength=0; /* byteIndex */ 427 } 428} 429 430static U_INLINE void 431setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 432 /* in ISO-2022-KR the designator sequence appears only once 433 * in a file so we append it only once 434 */ 435 if( converter->charErrorBufferLength==0){ 436 437 converter->charErrorBufferLength = 4; 438 converter->charErrorBuffer[0] = 0x1b; 439 converter->charErrorBuffer[1] = 0x24; 440 converter->charErrorBuffer[2] = 0x29; 441 converter->charErrorBuffer[3] = 0x43; 442 } 443 if(myConverterData->version == 1) { 444 UConverter *cnv = myConverterData->currentConverter; 445 446 cnv->fromUChar32=0; 447 cnv->fromUnicodeStatus=1; /* prevLength */ 448 } 449} 450 451static void 452_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 453 454 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 455 456 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 457 if(cnv->extraInfo != NULL) { 458 UConverterNamePieces stackPieces; 459 UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) }; 460 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 461 uint32_t version; 462 463 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; 
464 465 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 466 myConverterData->currentType = ASCII1; 467 cnv->fromUnicodeStatus =FALSE; 468 if(pArgs->locale){ 469 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 470 } 471 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 472 myConverterData->version = version; 473 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && 474 (myLocale[2]=='_' || myLocale[2]=='\0')) 475 { 476 size_t len=0; 477 /* open the required converters and cache them */ 478 if(version>MAX_JA_VERSION) { 479 /* prevent indexing beyond jpCharsetMasks[] */ 480 myConverterData->version = version = 0; 481 } 482 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 483 myConverterData->myConverterArray[ISO8859_7] = 484 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 485 } 486 myConverterData->myConverterArray[JISX208] = 487 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); 488 if(jpCharsetMasks[version]&CSM(JISX212)) { 489 myConverterData->myConverterArray[JISX212] = 490 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 491 } 492 if(jpCharsetMasks[version]&CSM(GB2312)) { 493 myConverterData->myConverterArray[GB2312] = 494 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 495 } 496 if(jpCharsetMasks[version]&CSM(KSC5601)) { 497 myConverterData->myConverterArray[KSC5601] = 498 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 499 } 500 501 /* set the function pointers to appropriate funtions */ 502 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 503 uprv_strcpy(myConverterData->locale,"ja"); 504 505 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 506 len = uprv_strlen(myConverterData->name); 507 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 508 myConverterData->name[len+1]='\0'; 509 } 510 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| 
myLocale[1]=='r') && 511 (myLocale[2]=='_' || myLocale[2]=='\0')) 512 { 513 const char *cnvName; 514 if(version==1) { 515 cnvName="icu-internal-25546"; 516 } else { 517 cnvName="ksc_5601"; 518 myConverterData->version=version=0; 519 } 520 if(pArgs->onlyTestIsLoadable) { 521 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 522 uprv_free(cnv->extraInfo); 523 cnv->extraInfo=NULL; 524 return; 525 } else { 526 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 527 if (U_FAILURE(*errorCode)) { 528 _ISO2022Close(cnv); 529 return; 530 } 531 532 if(version==1) { 533 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 534 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 535 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 536 }else{ 537 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 538 } 539 540 /* initialize the state variables */ 541 setInitialStateToUnicodeKR(cnv, myConverterData); 542 setInitialStateFromUnicodeKR(cnv, myConverterData); 543 544 /* set the function pointers to appropriate funtions */ 545 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 546 uprv_strcpy(myConverterData->locale,"ko"); 547 } 548 } 549 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 550 (myLocale[2]=='_' || myLocale[2]=='\0')) 551 { 552 553 /* open the required converters and cache them */ 554 myConverterData->myConverterArray[GB2312_1] = 555 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 556 if(version==1) { 557 myConverterData->myConverterArray[ISO_IR_165] = 558 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 559 } 560 myConverterData->myConverterArray[CNS_11643] = 561 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode); 562 563 564 /* set the function pointers to appropriate funtions */ 565 
cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 566 uprv_strcpy(myConverterData->locale,"cn"); 567 568 if (version==0){ 569 myConverterData->version = 0; 570 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 571 }else if (version==1){ 572 myConverterData->version = 1; 573 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 574 }else { 575 myConverterData->version = 2; 576 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 577 } 578 } 579 else{ 580#ifdef U_ENABLE_GENERIC_ISO_2022 581 myConverterData->isFirstBuffer = TRUE; 582 583 /* append the UTF-8 escape sequence */ 584 cnv->charErrorBufferLength = 3; 585 cnv->charErrorBuffer[0] = 0x1b; 586 cnv->charErrorBuffer[1] = 0x25; 587 cnv->charErrorBuffer[2] = 0x42; 588 589 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 590 /* initialize the state variables */ 591 uprv_strcpy(myConverterData->name,"ISO_2022"); 592#else 593 *errorCode = U_UNSUPPORTED_ERROR; 594 return; 595#endif 596 } 597 598 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 599 600 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 601 _ISO2022Close(cnv); 602 } 603 } else { 604 *errorCode = U_MEMORY_ALLOCATION_ERROR; 605 } 606} 607 608 609static void 610_ISO2022Close(UConverter *converter) { 611 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 612 UConverterSharedData **array = myData->myConverterArray; 613 int32_t i; 614 615 if (converter->extraInfo != NULL) { 616 /*close the array of converter pointers and free the memory*/ 617 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 618 if(array[i]!=NULL) { 619 ucnv_unloadSharedDataIfReady(array[i]); 620 } 621 } 622 623 ucnv_close(myData->currentConverter); 624 625 if(!converter->isExtraLocal){ 626 uprv_free (converter->extraInfo); 627 converter->extraInfo = NULL; 628 } 629 } 630} 631 632static void 633_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 634 
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 635 if(choice<=UCNV_RESET_TO_UNICODE) { 636 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 637 myConverterData->key = 0; 638 myConverterData->isEmptySegment = FALSE; 639 } 640 if(choice!=UCNV_RESET_TO_UNICODE) { 641 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 642 } 643#ifdef U_ENABLE_GENERIC_ISO_2022 644 if(myConverterData->locale[0] == 0){ 645 if(choice<=UCNV_RESET_TO_UNICODE) { 646 myConverterData->isFirstBuffer = TRUE; 647 myConverterData->key = 0; 648 if (converter->mode == UCNV_SO){ 649 ucnv_close (myConverterData->currentConverter); 650 myConverterData->currentConverter=NULL; 651 } 652 converter->mode = UCNV_SI; 653 } 654 if(choice!=UCNV_RESET_TO_UNICODE) { 655 /* re-append UTF-8 escape sequence */ 656 converter->charErrorBufferLength = 3; 657 converter->charErrorBuffer[0] = 0x1b; 658 converter->charErrorBuffer[1] = 0x28; 659 converter->charErrorBuffer[2] = 0x42; 660 } 661 } 662 else 663#endif 664 { 665 /* reset the state variables */ 666 if(myConverterData->locale[0] == 'k'){ 667 if(choice<=UCNV_RESET_TO_UNICODE) { 668 setInitialStateToUnicodeKR(converter, myConverterData); 669 } 670 if(choice!=UCNV_RESET_TO_UNICODE) { 671 setInitialStateFromUnicodeKR(converter, myConverterData); 672 } 673 } 674 } 675} 676 677static const char* 678_ISO2022getName(const UConverter* cnv){ 679 if(cnv->extraInfo){ 680 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 681 return myData->name; 682 } 683 return NULL; 684} 685 686 687/*************** to unicode *******************/ 688/**************************************************************************** 689 * Recognized escape sequences are 690 * <ESC>(B ASCII 691 * <ESC>.A ISO-8859-1 692 * <ESC>.F ISO-8859-7 693 * <ESC>(J JISX-201 694 * <ESC>(I JISX-201 695 * <ESC>$B JISX-208 696 * <ESC>$@ JISX-208 697 * <ESC>$(D JISX-212 698 * <ESC>$A GB2312 699 * <ESC>$(C KSC5601 
700 */ 701static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 702/* 0 1 2 3 4 5 6 7 8 9 */ 703 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 704 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 705 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 706 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 707 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 708 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 709 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 710 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 711}; 712 713/*************** to unicode *******************/ 714static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { 715/* 0 1 2 3 4 5 6 7 8 9 */ 716 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 717 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 718 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 719 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 720 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 721 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 722 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 723 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 724}; 725 726 727static UCNV_TableStates_2022 728getKey_2022(char c,int32_t* key,int32_t* offset){ 729 int32_t togo; 730 int32_t low = 0; 731 int32_t hi = MAX_STATES_2022; 732 int32_t oldmid=0; 733 734 togo = normalize_esq_chars_2022[(uint8_t)c]; 735 if(togo == 0) { 736 /* not a valid character anywhere in an escape sequence */ 737 *key = 0; 738 *offset = 0; 739 return INVALID_2022; 740 } 741 togo = (*key << 5) + togo; 742 743 while (hi != low) /*binary search*/{ 744 745 register int32_t mid = (hi+low) >> 1; /*Finds median*/ 746 747 if (mid == oldmid) 748 break; 749 750 if (escSeqStateTable_Key_2022[mid] > togo){ 751 hi = mid; 752 } 753 else if (escSeqStateTable_Key_2022[mid] < togo){ 754 low = mid; 755 } 756 else /*we found it*/{ 757 *key = togo; 758 *offset = mid; 759 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; 760 } 761 oldmid = mid; 762 763 } 764 765 *key = 0; 766 *offset = 0; 767 return INVALID_2022; 768} 769 770/*runs through a state machine to determine the escape sequence - codepage correspondance 771 */ 772static void 773changeState_2022(UConverter* _this, 774 const char** source, 775 const char* sourceLimit, 776 Variant2022 var, 777 UErrorCode* err){ 778 UCNV_TableStates_2022 value; 779 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); 780 uint32_t key = myData2022->key; 781 int32_t offset = 0; 782 int8_t initialToULength = _this->toULength; 783 char c; 784 785 value = VALID_NON_TERMINAL_2022; 786 while (*source < sourceLimit) { 787 c = *(*source)++; 788 
_this->toUBytes[_this->toULength++]=(uint8_t)c; 789 value = getKey_2022(c,(int32_t *) &key, &offset); 790 791 switch (value){ 792 793 case VALID_NON_TERMINAL_2022 : 794 /* continue with the loop */ 795 break; 796 797 case VALID_TERMINAL_2022: 798 key = 0; 799 goto DONE; 800 801 case INVALID_2022: 802 goto DONE; 803 804 case VALID_MAYBE_TERMINAL_2022: 805#ifdef U_ENABLE_GENERIC_ISO_2022 806 /* ESC ( B is ambiguous only for ISO_2022 itself */ 807 if(var == ISO_2022) { 808 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ 809 _this->toULength = 0; 810 811 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ 812 813 /* continue with the loop */ 814 value = VALID_NON_TERMINAL_2022; 815 break; 816 } else 817#endif 818 { 819 /* not ISO_2022 itself, finish here */ 820 value = VALID_TERMINAL_2022; 821 key = 0; 822 goto DONE; 823 } 824 } 825 } 826 827DONE: 828 myData2022->key = key; 829 830 if (value == VALID_NON_TERMINAL_2022) { 831 /* indicate that the escape sequence is incomplete: key!=0 */ 832 return; 833 } else if (value == INVALID_2022 ) { 834 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 835 } else /* value == VALID_TERMINAL_2022 */ { 836 switch(var){ 837#ifdef U_ENABLE_GENERIC_ISO_2022 838 case ISO_2022: 839 { 840 const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; 841 if(chosenConverterName == NULL) { 842 /* SS2 or SS3 */ 843 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 844 _this->toUCallbackReason = UCNV_UNASSIGNED; 845 return; 846 } 847 848 _this->mode = UCNV_SI; 849 ucnv_close(myData2022->currentConverter); 850 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); 851 if(U_SUCCESS(*err)) { 852 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 853 _this->mode = UCNV_SO; 854 } 855 break; 856 } 857#endif 858 case ISO_2022_JP: 859 { 860 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; 861 switch(tempState) { 862 
case INVALID_STATE: 863 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 864 break; 865 case SS2_STATE: 866 if(myData2022->toU2022State.cs[2]!=0) { 867 if(myData2022->toU2022State.g<2) { 868 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 869 } 870 myData2022->toU2022State.g=2; 871 } else { 872 /* illegal to have SS2 before a matching designator */ 873 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 874 } 875 break; 876 /* case SS3_STATE: not used in ISO-2022-JP-x */ 877 case ISO8859_1: 878 case ISO8859_7: 879 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 880 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 881 } else { 882 /* G2 charset for SS2 */ 883 myData2022->toU2022State.cs[2]=(int8_t)tempState; 884 } 885 break; 886 default: 887 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 888 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 889 } else { 890 /* G0 charset */ 891 myData2022->toU2022State.cs[0]=(int8_t)tempState; 892 } 893 break; 894 } 895 } 896 break; 897 case ISO_2022_CN: 898 { 899 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; 900 switch(tempState) { 901 case INVALID_STATE: 902 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 903 break; 904 case SS2_STATE: 905 if(myData2022->toU2022State.cs[2]!=0) { 906 if(myData2022->toU2022State.g<2) { 907 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 908 } 909 myData2022->toU2022State.g=2; 910 } else { 911 /* illegal to have SS2 before a matching designator */ 912 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 913 } 914 break; 915 case SS3_STATE: 916 if(myData2022->toU2022State.cs[3]!=0) { 917 if(myData2022->toU2022State.g<2) { 918 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 919 } 920 myData2022->toU2022State.g=3; 921 } else { 922 /* illegal to have SS3 before a matching designator */ 923 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 924 } 925 break; 926 case ISO_IR_165: 927 if(myData2022->version==0) { 928 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 929 break; 930 } 931 /*fall through*/ 932 case GB2312_1: 933 /*fall 
through*/ 934 case CNS_11643_1: 935 myData2022->toU2022State.cs[1]=(int8_t)tempState; 936 break; 937 case CNS_11643_2: 938 myData2022->toU2022State.cs[2]=(int8_t)tempState; 939 break; 940 default: 941 /* other CNS 11643 planes */ 942 if(myData2022->version==0) { 943 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 944 } else { 945 myData2022->toU2022State.cs[3]=(int8_t)tempState; 946 } 947 break; 948 } 949 } 950 break; 951 case ISO_2022_KR: 952 if(offset==0x30){ 953 /* nothing to be done, just accept this one escape sequence */ 954 } else { 955 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 956 } 957 break; 958 959 default: 960 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 961 break; 962 } 963 } 964 if(U_SUCCESS(*err)) { 965 _this->toULength = 0; 966 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { 967 if(_this->toULength>1) { 968 /* 969 * Ticket 5691: consistent illegal sequences: 970 * - We include at least the first byte (ESC) in the illegal sequence. 971 * - If any of the non-initial bytes could be the start of a character, 972 * we stop the illegal sequence before the first one of those. 973 * In escape sequences, all following bytes are "printable", that is, 974 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), 975 * they are valid single/lead bytes. 976 * For simplicity, we always only report the initial ESC byte as the 977 * illegal sequence and back out all other bytes we looked at. 978 */ 979 /* Back out some bytes. */ 980 int8_t backOutDistance=_this->toULength-1; 981 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; 982 if(backOutDistance<=bytesFromThisBuffer) { 983 /* same as initialToULength<=1 */ 984 *source-=backOutDistance; 985 } else { 986 /* Back out bytes from the previous buffer: Need to replay them. */ 987 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 988 /* same as -(initialToULength-1) */ 989 /* preToULength is negative! 
*/ 990 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); 991 *source-=bytesFromThisBuffer; 992 } 993 _this->toULength=1; 994 } 995 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { 996 _this->toUCallbackReason = UCNV_UNASSIGNED; 997 } 998} 999 1000/*Checks the characters of the buffer against valid 2022 escape sequences 1001*if the match we return a pointer to the initial start of the sequence otherwise 1002*we return sourceLimit 1003*/ 1004/*for 2022 looks ahead in the stream 1005 *to determine the longest possible convertible 1006 *data stream 1007 */ 1008static U_INLINE const char* 1009getEndOfBuffer_2022(const char** source, 1010 const char* sourceLimit, 1011 UBool flush){ 1012 1013 const char* mySource = *source; 1014 1015#ifdef U_ENABLE_GENERIC_ISO_2022 1016 if (*source >= sourceLimit) 1017 return sourceLimit; 1018 1019 do{ 1020 1021 if (*mySource == ESC_2022){ 1022 int8_t i; 1023 int32_t key = 0; 1024 int32_t offset; 1025 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; 1026 1027 /* Kludge: I could not 1028 * figure out the reason for validating an escape sequence 1029 * twice - once here and once in changeState_2022(). 1030 * is it possible to have an ESC character in a ISO2022 1031 * byte stream which is valid in a code page? Is it legal? 
1032 */ 1033 for (i=0; 1034 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); 1035 i++) { 1036 value = getKey_2022(*(mySource+i), &key, &offset); 1037 } 1038 if (value > 0 || *mySource==ESC_2022) 1039 return mySource; 1040 1041 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) 1042 return sourceLimit; 1043 } 1044 }while (++mySource < sourceLimit); 1045 1046 return sourceLimit; 1047#else 1048 while(mySource < sourceLimit && *mySource != ESC_2022) { 1049 ++mySource; 1050 } 1051 return mySource; 1052#endif 1053} 1054 1055 1056/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c 1057 * any future change in _MBCSFromUChar32() function should be reflected here. 1058 * @return number of bytes in *value; negative number if fallback; 0 if no mapping 1059 */ 1060static U_INLINE int32_t 1061MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, 1062 UChar32 c, 1063 uint32_t* value, 1064 UBool useFallback, 1065 int outputType) 1066{ 1067 const int32_t *cx; 1068 const uint16_t *table; 1069 uint32_t stage2Entry; 1070 uint32_t myValue; 1071 int32_t length; 1072 const uint8_t *p; 1073 /* 1074 * TODO(markus): Use and require new, faster MBCS conversion table structures. 1075 * Use internal version of ucnv_open() that verifies that the new structures are available, 1076 * else U_INTERNAL_PROGRAM_ERROR. 
1077 */ 1078 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1079 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1080 table=sharedData->mbcs.fromUnicodeTable; 1081 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1082 /* get the bytes and the length for the output */ 1083 if(outputType==MBCS_OUTPUT_2){ 1084 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1085 if(myValue<=0xff) { 1086 length=1; 1087 } else { 1088 length=2; 1089 } 1090 } else /* outputType==MBCS_OUTPUT_3 */ { 1091 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1092 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1093 if(myValue<=0xff) { 1094 length=1; 1095 } else if(myValue<=0xffff) { 1096 length=2; 1097 } else { 1098 length=3; 1099 } 1100 } 1101 /* is this code point assigned, or do we use fallbacks? */ 1102 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { 1103 /* assigned */ 1104 *value=myValue; 1105 return length; 1106 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { 1107 /* 1108 * We allow a 0 byte output if the "assigned" bit is set for this entry. 1109 * There is no way with this data structure for fallback output 1110 * to be a zero byte. 1111 */ 1112 *value=myValue; 1113 return -length; 1114 } 1115 } 1116 1117 cx=sharedData->mbcs.extIndexes; 1118 if(cx!=NULL) { 1119 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); 1120 } 1121 1122 /* unassigned */ 1123 return 0; 1124} 1125 1126/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c 1127 * any future change in _MBCSSingleFromUChar32() function should be reflected here. 
1128 * @param retval pointer to output byte 1129 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1130 */ 1131static U_INLINE int32_t 1132MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1133 UChar32 c, 1134 uint32_t* retval, 1135 UBool useFallback) 1136{ 1137 const uint16_t *table; 1138 int32_t value; 1139 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1140 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1141 return 0; 1142 } 1143 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1144 table=sharedData->mbcs.fromUnicodeTable; 1145 /* get the byte for the output */ 1146 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1147 /* is this code point assigned, or do we use fallbacks? */ 1148 *retval=(uint32_t)(value&0xff); 1149 if(value>=0xf00) { 1150 return 1; /* roundtrip */ 1151 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1152 return -1; /* fallback taken */ 1153 } else { 1154 return 0; /* no mapping */ 1155 } 1156} 1157 1158/* 1159 * Check that the result is a 2-byte value with each byte in the range A1..FE 1160 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1161 * to move it to the ISO 2022 range 21..7E. 1162 * Return 0 if out of range. 1163 */ 1164static U_INLINE uint32_t 1165_2022FromGR94DBCS(uint32_t value) { 1166 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1167 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1168 ) { 1169 return value - 0x8080; /* shift down to 21..7e byte range */ 1170 } else { 1171 return 0; /* not valid for ISO 2022 */ 1172 } 1173} 1174 1175#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1176/* 1177 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1178 * 2 byte value that is in the range A1..FE for each byte. 
Otherwise it returns the 2022 code point 1179 * unchanged. 1180 */ 1181static U_INLINE uint32_t 1182_2022ToGR94DBCS(uint32_t value) { 1183 uint32_t returnValue = value + 0x8080; 1184 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && 1185 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { 1186 return returnValue; 1187 } else { 1188 return value; 1189 } 1190} 1191#endif 1192 1193#ifdef U_ENABLE_GENERIC_ISO_2022 1194 1195/********************************************************************************** 1196* ISO-2022 Converter 1197* 1198* 1199*/ 1200 1201static void 1202T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, 1203 UErrorCode* err){ 1204 const char* mySourceLimit, *realSourceLimit; 1205 const char* sourceStart; 1206 const UChar* myTargetStart; 1207 UConverter* saveThis; 1208 UConverterDataISO2022* myData; 1209 int8_t length; 1210 1211 saveThis = args->converter; 1212 myData=((UConverterDataISO2022*)(saveThis->extraInfo)); 1213 1214 realSourceLimit = args->sourceLimit; 1215 while (args->source < realSourceLimit) { 1216 if(myData->key == 0) { /* are we in the middle of an escape sequence? 
*/ 1217 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 1218 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); 1219 1220 if(args->source < mySourceLimit) { 1221 if(myData->currentConverter==NULL) { 1222 myData->currentConverter = ucnv_open("ASCII",err); 1223 if(U_FAILURE(*err)){ 1224 return; 1225 } 1226 1227 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 1228 saveThis->mode = UCNV_SO; 1229 } 1230 1231 /* convert to before the ESC or until the end of the buffer */ 1232 myData->isFirstBuffer=FALSE; 1233 sourceStart = args->source; 1234 myTargetStart = args->target; 1235 args->converter = myData->currentConverter; 1236 ucnv_toUnicode(args->converter, 1237 &args->target, 1238 args->targetLimit, 1239 &args->source, 1240 mySourceLimit, 1241 args->offsets, 1242 (UBool)(args->flush && mySourceLimit == realSourceLimit), 1243 err); 1244 args->converter = saveThis; 1245 1246 if (*err == U_BUFFER_OVERFLOW_ERROR) { 1247 /* move the overflow buffer */ 1248 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; 1249 myData->currentConverter->UCharErrorBufferLength = 0; 1250 if(length > 0) { 1251 uprv_memcpy(saveThis->UCharErrorBuffer, 1252 myData->currentConverter->UCharErrorBuffer, 1253 length*U_SIZEOF_UCHAR); 1254 } 1255 return; 1256 } 1257 1258 /* 1259 * At least one of: 1260 * -Error while converting 1261 * -Done with entire buffer 1262 * -Need to write offsets or update the current offset 1263 * (leave that up to the code in ucnv.c) 1264 * 1265 * or else we just stopped at an ESC byte and continue with changeState_2022() 1266 */ 1267 if (U_FAILURE(*err) || 1268 (args->source == realSourceLimit) || 1269 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || 1270 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) 1271 ) { 1272 /* copy partial or error input for truncated detection and error 
handling */ 1273 if(U_FAILURE(*err)) { 1274 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; 1275 if(length > 0) { 1276 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); 1277 } 1278 } else { 1279 length = saveThis->toULength = myData->currentConverter->toULength; 1280 if(length > 0) { 1281 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); 1282 if(args->source < mySourceLimit) { 1283 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ 1284 } 1285 } 1286 } 1287 return; 1288 } 1289 } 1290 } 1291 1292 sourceStart = args->source; 1293 changeState_2022(args->converter, 1294 &(args->source), 1295 realSourceLimit, 1296 ISO_2022, 1297 err); 1298 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { 1299 /* let the ucnv.c code update its current offset */ 1300 return; 1301 } 1302 } 1303} 1304 1305#endif 1306 1307/* 1308 * To Unicode Callback helper function 1309 */ 1310static void 1311toUnicodeCallback(UConverter *cnv, 1312 const uint32_t sourceChar, const uint32_t targetUniChar, 1313 UErrorCode* err){ 1314 if(sourceChar>0xff){ 1315 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); 1316 cnv->toUBytes[1] = (uint8_t)sourceChar; 1317 cnv->toULength = 2; 1318 } 1319 else{ 1320 cnv->toUBytes[0] =(char) sourceChar; 1321 cnv->toULength = 1; 1322 } 1323 1324 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ 1325 *err = U_INVALID_CHAR_FOUND; 1326 } 1327 else{ 1328 *err = U_ILLEGAL_CHAR_FOUND; 1329 } 1330} 1331 1332/**************************************ISO-2022-JP*************************************************/ 1333 1334/************************************** IMPORTANT ************************************************** 1335* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and 1336* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). 
1337* The converter iterates over each Unicode codepoint 1338* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is 1339* processed one char at a time it would make sense to reduce the extra processing a canned converter 1340* would do as far as possible. 1341* 1342* If the implementation of these macros or structure of sharedData struct change in the future, make 1343* sure that ISO-2022 is also changed. 1344*************************************************************************************************** 1345*/ 1346 1347/*************************************************************************************************** 1348* Rules for ISO-2022-jp encoding 1349* (i) Escape sequences must be fully contained within a line they should not 1350* span new lines or CRs 1351* (ii) If the last character on a line is represented by two bytes then an ASCII or 1352* JIS-Roman character escape sequence should follow before the line terminates 1353* (iii) If the first character on the line is represented by two bytes then a two 1354* byte character escape sequence should precede it 1355* (iv) If no escape sequence is encountered then the characters are ASCII 1356* (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, 1357* and invoked with SS2 (ESC N). 1358* (vi) If there is any G0 designation in text, there must be a switch to 1359* ASCII or to JIS X 0201-Roman before a space character (but not 1360* necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control 1361* characters such as tab or CRLF. 
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*  source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmically implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/

/* preference order of JP charsets */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};

/*
 * Designation escape sequences, one row per charset.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 * Each row is a NUL-padded char[6]; the number of bytes to emit is taken from
 * the parallel escSeqCharsLen[] table.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/* Byte length of each escSeqChars[] row, in the same order. */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};

/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*       b) remain in this state until an invalid
character is found 1421* No -> a) go to the next code page and find the character 1422* iii) Before changing the state increment the current state check if the current state 1423* is equal to the intitIteration state 1424* Yes -> A character that cannot be represented in any of the supported encodings 1425* break and return a U_INVALID_CHARACTER error 1426* No -> Continue and find the character in next code page 1427* 1428* 1429* TODO: Implement a priority technique where the users are allowed to set the priority of code pages 1430*/ 1431 1432/* Map 00..7F to Unicode according to JIS X 0201. */ 1433static U_INLINE uint32_t 1434jisx201ToU(uint32_t value) { 1435 if(value < 0x5c) { 1436 return value; 1437 } else if(value == 0x5c) { 1438 return 0xa5; 1439 } else if(value == 0x7e) { 1440 return 0x203e; 1441 } else /* value <= 0x7f */ { 1442 return value; 1443 } 1444} 1445 1446/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ 1447static U_INLINE uint32_t 1448jisx201FromU(uint32_t value) { 1449 if(value<=0x7f) { 1450 if(value!=0x5c && value!=0x7e) { 1451 return value; 1452 } 1453 } else if(value==0xa5) { 1454 return 0x5c; 1455 } else if(value==0x203e) { 1456 return 0x7e; 1457 } 1458 return 0xfffe; 1459} 1460 1461/* 1462 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding 1463 * to JIS X 0208, and convert it to a pair of 21..7E bytes. 1464 * Return 0 if the byte pair is out of range. 
1465 */ 1466static U_INLINE uint32_t 1467_2022FromSJIS(uint32_t value) { 1468 uint8_t trail; 1469 1470 if(value > 0xEFFC) { 1471 return 0; /* beyond JIS X 0208 */ 1472 } 1473 1474 trail = (uint8_t)value; 1475 1476 value &= 0xff00; /* lead byte */ 1477 if(value <= 0x9f00) { 1478 value -= 0x7000; 1479 } else /* 0xe000 <= value <= 0xef00 */ { 1480 value -= 0xb000; 1481 } 1482 value <<= 1; 1483 1484 if(trail <= 0x9e) { 1485 value -= 0x100; 1486 if(trail <= 0x7e) { 1487 value |= trail - 0x1f; 1488 } else { 1489 value |= trail - 0x20; 1490 } 1491 } else /* trail <= 0xfc */ { 1492 value |= trail - 0x7e; 1493 } 1494 return value; 1495} 1496 1497/* 1498 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. 1499 * If either byte is outside 21..7E make sure that the result is not valid 1500 * for Shift-JIS so that the converter catches it. 1501 * Some invalid byte values already turn into equally invalid Shift-JIS 1502 * byte values and need not be tested explicitly. 1503 */ 1504static U_INLINE void 1505_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { 1506 if(c1&1) { 1507 ++c1; 1508 if(c2 <= 0x5f) { 1509 c2 += 0x1f; 1510 } else if(c2 <= 0x7e) { 1511 c2 += 0x20; 1512 } else { 1513 c2 = 0; /* invalid */ 1514 } 1515 } else { 1516 if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { 1517 c2 += 0x7e; 1518 } else { 1519 c2 = 0; /* invalid */ 1520 } 1521 } 1522 c1 >>= 1; 1523 if(c1 <= 0x2f) { 1524 c1 += 0x70; 1525 } else if(c1 <= 0x3f) { 1526 c1 += 0xb0; 1527 } else { 1528 c1 = 0; /* invalid */ 1529 } 1530 bytes[0] = (char)c1; 1531 bytes[1] = (char)c2; 1532} 1533 1534/* 1535 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) 1536 * Katakana. 1537 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks 1538 * because Shift-JIS roundtrips half-width Katakana to single bytes. 1539 * These were the only fallbacks in ICU's jisx-208.ucm file. 
*/
/* Indexed by (code point - HWKANA_START); each entry is a two-byte 21..7E code. */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C,  /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F,  /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F,  /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C   /* U+FF9F */
};

/*
 * Unicode to ISO-2022-JP family conversion, with optional offsets.
 *
 * For each input code point the function:
 *  1. combines surrogate pairs (a lead surrogate left over from the previous
 *     call is picked up from cnv->fromUChar32);
 *  2. rejects SO/SI/ESC and unpaired surrogates with U_ILLEGAL_CHAR_FOUND;
 *  3. builds, when choiceCount is 0, the ordered list choices[] of charsets to try:
 *     half-width Katakana first for versions 3 and 4, then the current G0 and G2
 *     charsets, then the remaining allowed charsets in jpCharsetPref[] order;
 *  4. tries the choices until a roundtrip mapping is found; the first fallback found
 *     is kept but the search goes on for a roundtrip;
 *  5. assembles SI, designation escape, SO or ESC N, and the 1 or 2 result bytes
 *     in buffer[] and writes them to the target.
 *
 * A code point with no mapping stops the loop with U_INVALID_CHAR_FOUND; a full
 * target stops it with U_BUFFER_OVERFLOW_ERROR. In both error cases for input
 * characters the offending code point is stored in cnv->fromUChar32.
 *
 * When flushing at the end of the input, the output is returned to G0/ASCII.
 * On return args->source and args->target are updated.
 */
static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    char buffer[8];         /* pending output for one code point: shifts, escape, data bytes */
    int32_t len, outLen;
    int8_t choices[10];     /* charsets to try, in order */
    int32_t choiceCount;    /* 0 means choices[] must be rebuilt */
    uint32_t targetValue = 0;
    UBool useFallback;

    int32_t i;
    int8_t cs, g;

    /* set up the state */
    converterData = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State = &converterData->fromU2022State;

    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        goto getTrail;
    }

    while(source < sourceLimit) {
        if(target < targetLimit) {

            sourceChar = *(source++);
            /*check if the char is a First surrogate*/
            if(UTF_IS_SURROGATE(sourceChar)) {
                if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(UTF_IS_SECOND_SURROGATE(trail)) {
                            source++;
                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                            cnv->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            cnv->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input */
                        cnv->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            /* do the conversion */

            if(choiceCount == 0) {
                uint16_t csm;

                /*
                 * The csm variable keeps track of which charsets are allowed
                 * and not used yet while building the choices[].
                 */
                csm = jpCharsetMasks[converterData->version];
                choiceCount = 0;

                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
                if(converterData->version == 3 || converterData->version == 4) {
                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
                }
                /* Do not try single-byte half-width Katakana for other versions. */
                csm &= ~CSM(HWKANA_7BIT);

                /* try the current G0 charset */
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
                csm &= ~CSM(cs);

                /* try the current G2 charset */
                if((cs = pFromU2022State->cs[2]) != 0) {
                    choices[choiceCount++] = cs;
                    csm &= ~CSM(cs);
                }

                /* try all the other possible charsets */
                for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
                    cs = (int8_t)jpCharsetPref[i];
                    if(CSM(cs) & csm) {
                        choices[choiceCount++] = cs;
                        csm &= ~CSM(cs);
                    }
                }
            }

            cs = g = 0;
            /*
             * len==0: no mapping found yet
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
             * len>0: found a roundtrip result, done
             */
            len = 0;
            /*
             * We will turn off useFallback after finding a fallback,
             * but we still get fallbacks from PUA code points as usual.
             * Therefore, we will also need to check that we don't overwrite
             * an early fallback with a later one.
             */
            useFallback = cnv->useFallback;

            for(i = 0; i < choiceCount && len <= 0; ++i) {
                uint32_t value;
                int32_t len2;
                int8_t cs0 = choices[i];
                switch(cs0) {
                case ASCII:
                    if(sourceChar <= 0x7f) {
                        targetValue = (uint32_t)sourceChar;
                        len = 1;
                        cs = cs0;
                        g = 0;
                    }
                    break;
                case ISO8859_1:
                    /* G2 charset: U+00A0..U+00FF are output as bytes 20..7F */
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
                        targetValue = (uint32_t)sourceChar - 0x80;
                        len = 1;
                        cs = cs0;
                        g = 2;
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        if(converterData->version==3) {
                            /* JIS7: use G1 (SO) */
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
                            len = 1;
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
                            g = 1;
                        } else if(converterData->version==4) {
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
                            len = 1;

                            cs = pFromU2022State->cs[0];
                            if(IS_JP_DBCS(cs)) {
                                /* switch from a DBCS charset to JISX201 */
                                cs = (int8_t)JISX201;
                            }
                            /* else stay in the current G0 charset */
                            g = 0;
                        }
                        /* else do not use HWKANA_7BIT with other versions */
                    }
                    break;
                case JISX201:
                    /* G0 SBCS */
                    value = jisx201FromU(sourceChar);
                    if(value <= 0x7f) {
                        targetValue = value;
                        len = 1;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case JISX208:
                    /* G0 DBCS from Shift-JIS table */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
                        value = _2022FromSJIS(value);
                        if(value != 0) {
                            targetValue = value;
                            len = len2;
                            cs = cs0;
                            g = 0;
                            useFallback = FALSE;
                        }
                    } else if(len == 0 && useFallback &&
                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        /* hardcoded fallback from half-width to full-width Katakana */
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
                        len = -2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case ISO8859_7:
                    /* G2 SBCS forced to 7-bit output: accepted bytes A0..FF are written as 20..7F */
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback);
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
                        targetValue = value - 0x80;
                        len = len2;
                        cs = cs0;
                        g = 2;
                        useFallback = FALSE;
                    }
                    break;
                default:
                    /* G0 DBCS */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
                        if(cs0 == KSC5601) {
                            /*
                             * Check for valid bytes for the encoding scheme.
                             * This is necessary because the sub-converter (windows-949)
                             * has a broader encoding scheme than is valid for 2022.
                             */
                            value = _2022FromGR94DBCS(value);
                            if(value == 0) {
                                break;
                            }
                        }
                        targetValue = value;
                        len = len2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                }
            }

            if(len != 0) {
                if(len < 0) {
                    len = -len; /* fallback */
                }
                outLen = 0; /* count output bytes */

                /* write SI if necessary (only for JIS7) */
                if(pFromU2022State->g == 1 && g == 0) {
                    buffer[outLen++] = UCNV_SI;
                    pFromU2022State->g = 0;
                }

                /* write the designation sequence if necessary */
                if(cs != pFromU2022State->cs[g]) {
                    int32_t escLen = escSeqCharsLen[cs];
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
                    outLen += escLen;
                    pFromU2022State->cs[g] = cs;

                    /* invalidate the choices[] */
                    choiceCount = 0;
                }

                /* write the shift sequence if necessary */
                if(g != pFromU2022State->g) {
                    switch(g) {
                    /* case 0 handled before writing escapes */
                    case 1:
                        buffer[outLen++] = UCNV_SO;
                        pFromU2022State->g = 1;
                        break;
                    default: /* case 2 */
                        /* ESC N (SS2); the stored g state is left unchanged */
                        buffer[outLen++] = 0x1b;
                        buffer[outLen++] = 0x4e;
                        break;
                    /* no case 3: no SS3 in ISO-2022-JP-x */
                    }
                }

                /* write the output bytes */
                if(len == 1) {
                    buffer[outLen++] = (char)targetValue;
                } else /* len == 2 */ {
                    buffer[outLen++] = (char)(targetValue >> 8);
                    buffer[outLen++] = (char)targetValue;
                }
            } else {
                /*
                 * if we cannot find the character after checking all codepages
                 * then this is an error
                 */
                *err = U_INVALID_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            if(sourceChar == CR || sourceChar == LF) {
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
                pFromU2022State->cs[2] = 0;
                choiceCount = 0;
            }

            /* output outLen>0 bytes in buffer[] */
            if(outLen == 1) {
                *target++ = buffer[0];
                if(offsets) {
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
                }
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
                *target++ = buffer[0];
                *target++ = buffer[1];
                if(offsets) {
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
                    *offsets++ = sourceIndex;
                    *offsets++ = sourceIndex;
                }
            } else {
                /* longer output, or output that may not fit in the remaining target */
                fromUWriteUInt8(
                    cnv,
                    buffer, outLen,
                    &target, (const char *)targetLimit,
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
                    err);
                if(U_FAILURE(*err)) {
                    break;
                }
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-JP conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   in SO mode or not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
    ) {
        int32_t sourceIndex;

        outLen = 0;

        if(pFromU2022State->g != 0) {
            buffer[outLen++] = UCNV_SI;
            pFromU2022State->g = 0;
        }

        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            cnv,
            buffer, outLen,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}

/*************** to unicode *******************/

static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    uint32_t tmpSourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
2044 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2045 /* continue with a partial double-byte character */ 2046 mySourceChar = args->converter->toUBytes[0]; 2047 args->converter->toULength = 0; 2048 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; 2049 targetUniChar = missingCharMarker; 2050 goto getTrailByte; 2051 } 2052 2053 while(mySource < mySourceLimit){ 2054 2055 targetUniChar =missingCharMarker; 2056 2057 if(myTarget < args->targetLimit){ 2058 2059 mySourceChar= (unsigned char) *mySource++; 2060 2061 switch(mySourceChar) { 2062 case UCNV_SI: 2063 if(myData->version==3) { 2064 pToU2022State->g=0; 2065 continue; 2066 } else { 2067 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ 2068 myData->isEmptySegment = FALSE; /* reset this, we have a different error */ 2069 break; 2070 } 2071 2072 case UCNV_SO: 2073 if(myData->version==3) { 2074 /* JIS7: switch to G1 half-width Katakana */ 2075 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; 2076 pToU2022State->g=1; 2077 continue; 2078 } else { 2079 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ 2080 myData->isEmptySegment = FALSE; /* reset this, we have a different error */ 2081 break; 2082 } 2083 2084 case ESC_2022: 2085 mySource--; 2086escape: 2087 { 2088 const char * mySourceBefore = mySource; 2089 int8_t toULengthBefore = args->converter->toULength; 2090 2091 changeState_2022(args->converter,&(mySource), 2092 mySourceLimit, ISO_2022_JP,err); 2093 2094 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */ 2095 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 2096 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2097 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2098 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 2099 } 2100 } 2101 2102 /* invalid or illegal escape sequence */ 2103 if(U_FAILURE(*err)){ 2104 args->target = 
myTarget; 2105 args->source = mySource; 2106 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ 2107 return; 2108 } 2109 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ 2110 if(myData->key==0) { 2111 myData->isEmptySegment = TRUE; 2112 } 2113 continue; 2114 2115 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ 2116 2117 case CR: 2118 /*falls through*/ 2119 case LF: 2120 /* automatically reset to single-byte mode */ 2121 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { 2122 pToU2022State->cs[0] = (int8_t)ASCII; 2123 } 2124 pToU2022State->cs[2] = 0; 2125 pToU2022State->g = 0; 2126 /* falls through */ 2127 default: 2128 /* convert one or two bytes */ 2129 myData->isEmptySegment = FALSE; 2130 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; 2131 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && 2132 !IS_JP_DBCS(cs) 2133 ) { 2134 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ 2135 targetUniChar = mySourceChar + (HWKANA_START - 0xa1); 2136 2137 /* return from a single-shift state to the previous one */ 2138 if(pToU2022State->g >= 2) { 2139 pToU2022State->g=pToU2022State->prevG; 2140 } 2141 } else switch(cs) { 2142 case ASCII: 2143 if(mySourceChar <= 0x7f) { 2144 targetUniChar = mySourceChar; 2145 } 2146 break; 2147 case ISO8859_1: 2148 if(mySourceChar <= 0x7f) { 2149 targetUniChar = mySourceChar + 0x80; 2150 } 2151 /* return from a single-shift state to the previous one */ 2152 pToU2022State->g=pToU2022State->prevG; 2153 break; 2154 case ISO8859_7: 2155 if(mySourceChar <= 0x7f) { 2156 /* convert mySourceChar+0x80 to use a normal 8-bit table */ 2157 targetUniChar = 2158 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( 2159 myData->myConverterArray[cs], 2160 mySourceChar + 0x80); 2161 } 2162 /* return from a single-shift state to the previous one */ 2163 pToU2022State->g=pToU2022State->prevG; 2164 break; 2165 case 
JISX201: 2166 if(mySourceChar <= 0x7f) { 2167 targetUniChar = jisx201ToU(mySourceChar); 2168 } 2169 break; 2170 case HWKANA_7BIT: 2171 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { 2172 /* 7-bit halfwidth Katakana */ 2173 targetUniChar = mySourceChar + (HWKANA_START - 0x21); 2174 } 2175 break; 2176 default: 2177 /* G0 DBCS */ 2178 if(mySource < mySourceLimit) { 2179 int leadIsOk, trailIsOk; 2180 uint8_t trailByte; 2181getTrailByte: 2182 trailByte = (uint8_t)*mySource; 2183 /* 2184 * Ticket 5691: consistent illegal sequences: 2185 * - We include at least the first byte in the illegal sequence. 2186 * - If any of the non-initial bytes could be the start of a character, 2187 * we stop the illegal sequence before the first one of those. 2188 * 2189 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2190 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2191 * Otherwise we convert or report the pair of bytes. 2192 */ 2193 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2194 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2195 if (leadIsOk && trailIsOk) { 2196 ++mySource; 2197 tmpSourceChar = (mySourceChar << 8) | trailByte; 2198 if(cs == JISX208) { 2199 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); 2200 mySourceChar = tmpSourceChar; 2201 } else { 2202 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. 
*/ 2203 mySourceChar = tmpSourceChar; 2204 if (cs == KSC5601) { 2205 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ 2206 } 2207 tempBuf[0] = (char)(tmpSourceChar >> 8); 2208 tempBuf[1] = (char)(tmpSourceChar); 2209 } 2210 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); 2211 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2212 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2213 ++mySource; 2214 /* add another bit so that the code below writes 2 bytes in case of error */ 2215 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2216 } 2217 } else { 2218 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2219 args->converter->toULength = 1; 2220 goto endloop; 2221 } 2222 } /* End of inner switch */ 2223 break; 2224 } /* End of outer switch */ 2225 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 2226 if(args->offsets){ 2227 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 2228 } 2229 *(myTarget++)=(UChar)targetUniChar; 2230 } 2231 else if(targetUniChar > missingCharMarker){ 2232 /* disassemble the surrogate pair and write to output*/ 2233 targetUniChar-=0x0010000; 2234 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 2235 if(args->offsets){ 2236 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 2237 } 2238 ++myTarget; 2239 if(myTarget< args->targetLimit){ 2240 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 2241 if(args->offsets){ 2242 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 2243 } 2244 ++myTarget; 2245 }else{ 2246 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 2247 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 2248 } 2249 2250 } 2251 else{ 2252 /* Call the callback function*/ 2253 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2254 break; 2255 } 2256 } 2257 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ 2258 *err =U_BUFFER_OVERFLOW_ERROR; 2259 break; 2260 } 2261 } 2262endloop: 2263 args->target = myTarget; 2264 args->source = mySource; 2265} 2266 2267 2268/*************************************************************** 2269* Rules for ISO-2022-KR encoding 2270* i) The KSC5601 designator sequence should appear only once in a file, 2271* at the begining of a line before any KSC5601 characters. This usually 2272* means that it appears by itself on the first line of the file 2273* ii) There are only 2 shifting sequences SO to shift into double byte mode 2274* and SI to shift into single byte mode 2275*/ 2276static void 2277UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2278 2279 UConverter* saveConv = args->converter; 2280 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; 2281 args->converter=myConverterData->currentConverter; 2282 2283 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; 2284 ucnv_MBCSFromUnicodeWithOffsets(args,err); 2285 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 2286 2287 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2288 if(myConverterData->currentConverter->charErrorBufferLength > 0) { 2289 uprv_memcpy( 2290 saveConv->charErrorBuffer, 2291 myConverterData->currentConverter->charErrorBuffer, 2292 myConverterData->currentConverter->charErrorBufferLength); 2293 } 2294 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 2295 
myConverterData->currentConverter->charErrorBufferLength = 0; 2296 } 2297 args->converter=saveConv; 2298} 2299 2300static void 2301UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2302 2303 const UChar *source = args->source; 2304 const UChar *sourceLimit = args->sourceLimit; 2305 unsigned char *target = (unsigned char *) args->target; 2306 unsigned char *targetLimit = (unsigned char *) args->targetLimit; 2307 int32_t* offsets = args->offsets; 2308 uint32_t targetByteUnit = 0x0000; 2309 UChar32 sourceChar = 0x0000; 2310 UBool isTargetByteDBCS; 2311 UBool oldIsTargetByteDBCS; 2312 UConverterDataISO2022 *converterData; 2313 UConverterSharedData* sharedData; 2314 UBool useFallback; 2315 int32_t length =0; 2316 2317 converterData=(UConverterDataISO2022*)args->converter->extraInfo; 2318 /* if the version is 1 then the user is requesting 2319 * conversion with ibm-25546 pass the arguments to 2320 * MBCS converter and return 2321 */ 2322 if(converterData->version==1){ 2323 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2324 return; 2325 } 2326 2327 /* initialize data */ 2328 sharedData = converterData->currentConverter->sharedData; 2329 useFallback = args->converter->useFallback; 2330 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; 2331 oldIsTargetByteDBCS = isTargetByteDBCS; 2332 2333 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; 2334 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { 2335 goto getTrail; 2336 } 2337 while(source < sourceLimit){ 2338 2339 targetByteUnit = missingCharMarker; 2340 2341 if(target < (unsigned char*) args->targetLimit){ 2342 sourceChar = *source++; 2343 2344 /* do not convert SO/SI/ESC */ 2345 if(IS_2022_CONTROL(sourceChar)) { 2346 /* callback(illegal) */ 2347 *err=U_ILLEGAL_CHAR_FOUND; 2348 args->converter->fromUChar32=sourceChar; 2349 break; 2350 } 2351 2352 length = 
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); 2353 if(length < 0) { 2354 length = -length; /* fallback */ 2355 } 2356 /* only DBCS or SBCS characters are expected*/ 2357 /* DB characters with high bit set to 1 are expected */ 2358 if( length > 2 || length==0 || 2359 (length == 1 && targetByteUnit > 0x7f) || 2360 (length == 2 && 2361 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || 2362 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) 2363 ) { 2364 targetByteUnit=missingCharMarker; 2365 } 2366 if (targetByteUnit != missingCharMarker){ 2367 2368 oldIsTargetByteDBCS = isTargetByteDBCS; 2369 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); 2370 /* append the shift sequence */ 2371 if (oldIsTargetByteDBCS != isTargetByteDBCS ){ 2372 2373 if (isTargetByteDBCS) 2374 *target++ = UCNV_SO; 2375 else 2376 *target++ = UCNV_SI; 2377 if(offsets) 2378 *(offsets++) = (int32_t)(source - args->source-1); 2379 } 2380 /* write the targetUniChar to target */ 2381 if(targetByteUnit <= 0x00FF){ 2382 if( target < targetLimit){ 2383 *(target++) = (unsigned char) targetByteUnit; 2384 if(offsets){ 2385 *(offsets++) = (int32_t)(source - args->source-1); 2386 } 2387 2388 }else{ 2389 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); 2390 *err = U_BUFFER_OVERFLOW_ERROR; 2391 } 2392 }else{ 2393 if(target < targetLimit){ 2394 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); 2395 if(offsets){ 2396 *(offsets++) = (int32_t)(source - args->source-1); 2397 } 2398 if(target < targetLimit){ 2399 *(target++) =(unsigned char) (targetByteUnit -0x80); 2400 if(offsets){ 2401 *(offsets++) = (int32_t)(source - args->source-1); 2402 } 2403 }else{ 2404 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); 2405 *err = U_BUFFER_OVERFLOW_ERROR; 2406 } 2407 }else{ 2408 
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); 2409 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); 2410 *err = U_BUFFER_OVERFLOW_ERROR; 2411 } 2412 } 2413 2414 } 2415 else{ 2416 /* oops.. the code point is unassingned 2417 * set the error and reason 2418 */ 2419 2420 /*check if the char is a First surrogate*/ 2421 if(UTF_IS_SURROGATE(sourceChar)) { 2422 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { 2423getTrail: 2424 /*look ahead to find the trail surrogate*/ 2425 if(source < sourceLimit) { 2426 /* test the following code unit */ 2427 UChar trail=(UChar) *source; 2428 if(UTF_IS_SECOND_SURROGATE(trail)) { 2429 source++; 2430 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); 2431 *err = U_INVALID_CHAR_FOUND; 2432 /* convert this surrogate code point */ 2433 /* exit this condition tree */ 2434 } else { 2435 /* this is an unmatched lead code unit (1st surrogate) */ 2436 /* callback(illegal) */ 2437 *err=U_ILLEGAL_CHAR_FOUND; 2438 } 2439 } else { 2440 /* no more input */ 2441 *err = U_ZERO_ERROR; 2442 } 2443 } else { 2444 /* this is an unmatched trail code unit (2nd surrogate) */ 2445 /* callback(illegal) */ 2446 *err=U_ILLEGAL_CHAR_FOUND; 2447 } 2448 } else { 2449 /* callback(unassigned) for a BMP code point */ 2450 *err = U_INVALID_CHAR_FOUND; 2451 } 2452 2453 args->converter->fromUChar32=sourceChar; 2454 break; 2455 } 2456 } /* end if(myTargetIndex<myTargetLength) */ 2457 else{ 2458 *err =U_BUFFER_OVERFLOW_ERROR; 2459 break; 2460 } 2461 2462 }/* end while(mySourceIndex<mySourceLength) */ 2463 2464 /* 2465 * the end of the input stream and detection of truncated input 2466 * are handled by the framework, but for ISO-2022-KR conversion 2467 * we need to be in ASCII mode at the very end 2468 * 2469 * conditions: 2470 * successful 2471 * not in ASCII mode 2472 * end of input and no truncated input 2473 */ 2474 if( U_SUCCESS(*err) && 
2475 isTargetByteDBCS && 2476 args->flush && source>=sourceLimit && args->converter->fromUChar32==0 2477 ) { 2478 int32_t sourceIndex; 2479 2480 /* we are switching to ASCII */ 2481 isTargetByteDBCS=FALSE; 2482 2483 /* get the source index of the last input character */ 2484 /* 2485 * TODO this would be simpler and more reliable if we used a pair 2486 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 2487 * so that we could simply use the prevSourceIndex here; 2488 * this code gives an incorrect result for the rare case of an unmatched 2489 * trail surrogate that is alone in the last buffer of the text stream 2490 */ 2491 sourceIndex=(int32_t)(source-args->source); 2492 if(sourceIndex>0) { 2493 --sourceIndex; 2494 if( U16_IS_TRAIL(args->source[sourceIndex]) && 2495 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 2496 ) { 2497 --sourceIndex; 2498 } 2499 } else { 2500 sourceIndex=-1; 2501 } 2502 2503 fromUWriteUInt8( 2504 args->converter, 2505 SHIFT_IN_STR, 1, 2506 &target, (const char *)targetLimit, 2507 &offsets, sourceIndex, 2508 err); 2509 } 2510 2511 /*save the state and return */ 2512 args->source = source; 2513 args->target = (char*)target; 2514 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; 2515} 2516 2517/************************ To Unicode ***************************************/ 2518 2519static void 2520UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, 2521 UErrorCode* err){ 2522 char const* sourceStart; 2523 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2524 2525 UConverterToUnicodeArgs subArgs; 2526 int32_t minArgsSize; 2527 2528 /* set up the subconverter arguments */ 2529 if(args->size<sizeof(UConverterToUnicodeArgs)) { 2530 minArgsSize = args->size; 2531 } else { 2532 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); 2533 } 2534 2535 uprv_memcpy(&subArgs, args, minArgsSize); 2536 subArgs.size = (uint16_t)minArgsSize; 2537 subArgs.converter = 
myData->currentConverter; 2538 2539 /* remember the original start of the input for offsets */ 2540 sourceStart = args->source; 2541 2542 if(myData->key != 0) { 2543 /* continue with a partial escape sequence */ 2544 goto escape; 2545 } 2546 2547 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2548 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2549 subArgs.source = args->source; 2550 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2551 if(subArgs.source != subArgs.sourceLimit) { 2552 /* 2553 * get the current partial byte sequence 2554 * 2555 * it needs to be moved between the public and the subconverter 2556 * so that the conversion framework, which only sees the public 2557 * converter, can handle truncated and illegal input etc. 2558 */ 2559 if(args->converter->toULength > 0) { 2560 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2561 } 2562 subArgs.converter->toULength = args->converter->toULength; 2563 2564 /* 2565 * Convert up to the end of the input, or to before the next escape character. 2566 * Does not handle conversion extensions because the preToU[] state etc. 2567 * is not copied. 
2568 */ 2569 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2570 2571 if(args->offsets != NULL && sourceStart != args->source) { 2572 /* update offsets to base them on the actual start of the input */ 2573 int32_t *offsets = args->offsets; 2574 UChar *target = args->target; 2575 int32_t delta = (int32_t)(args->source - sourceStart); 2576 while(target < subArgs.target) { 2577 if(*offsets >= 0) { 2578 *offsets += delta; 2579 } 2580 ++offsets; 2581 ++target; 2582 } 2583 } 2584 args->source = subArgs.source; 2585 args->target = subArgs.target; 2586 args->offsets = subArgs.offsets; 2587 2588 /* copy input/error/overflow buffers */ 2589 if(subArgs.converter->toULength > 0) { 2590 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2591 } 2592 args->converter->toULength = subArgs.converter->toULength; 2593 2594 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2595 if(subArgs.converter->UCharErrorBufferLength > 0) { 2596 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2597 subArgs.converter->UCharErrorBufferLength); 2598 } 2599 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2600 subArgs.converter->UCharErrorBufferLength = 0; 2601 } 2602 } 2603 2604 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2605 return; 2606 } 2607 2608escape: 2609 changeState_2022(args->converter, 2610 &(args->source), 2611 args->sourceLimit, 2612 ISO_2022_KR, 2613 err); 2614 } 2615} 2616 2617static void 2618UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2619 UErrorCode* err){ 2620 char tempBuf[2]; 2621 const char *mySource = ( char *) args->source; 2622 UChar *myTarget = args->target; 2623 const char *mySourceLimit = args->sourceLimit; 2624 UChar32 targetUniChar = 0x0000; 2625 UChar mySourceChar = 0x0000; 2626 UConverterDataISO2022* myData; 2627 UConverterSharedData* sharedData ; 2628 UBool useFallback; 2629 2630 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2631 if(myData->version==1){ 2632 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2633 return; 2634 } 2635 2636 /* initialize state */ 2637 sharedData = myData->currentConverter->sharedData; 2638 useFallback = args->converter->useFallback; 2639 2640 if(myData->key != 0) { 2641 /* continue with a partial escape sequence */ 2642 goto escape; 2643 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2644 /* continue with a partial double-byte character */ 2645 mySourceChar = args->converter->toUBytes[0]; 2646 args->converter->toULength = 0; 2647 goto getTrailByte; 2648 } 2649 2650 while(mySource< mySourceLimit){ 2651 2652 if(myTarget < args->targetLimit){ 2653 2654 mySourceChar= (unsigned char) *mySource++; 2655 2656 if(mySourceChar==UCNV_SI){ 2657 myData->toU2022State.g = 0; 2658 if (myData->isEmptySegment) { 2659 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2660 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2661 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2662 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2663 args->converter->toULength = 1; 2664 args->target = myTarget; 2665 args->source = mySource; 2666 return; 2667 } 2668 /*consume the source */ 2669 continue; 2670 }else if(mySourceChar==UCNV_SO){ 2671 myData->toU2022State.g = 1; 2672 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2673 /*consume the source */ 2674 continue; 2675 }else if(mySourceChar==ESC_2022){ 2676 mySource--; 2677escape: 2678 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2679 changeState_2022(args->converter,&(mySource), 2680 mySourceLimit, ISO_2022_KR, err); 2681 if(U_FAILURE(*err)){ 2682 args->target = myTarget; 2683 args->source = mySource; 2684 return; 2685 } 2686 continue; 2687 } 2688 2689 myData->isEmptySegment = FALSE; 
/* Any invalid char errors will be detected separately, so just reset this */ 2690 if(myData->toU2022State.g == 1) { 2691 if(mySource < mySourceLimit) { 2692 int leadIsOk, trailIsOk; 2693 uint8_t trailByte; 2694getTrailByte: 2695 targetUniChar = missingCharMarker; 2696 trailByte = (uint8_t)*mySource; 2697 /* 2698 * Ticket 5691: consistent illegal sequences: 2699 * - We include at least the first byte in the illegal sequence. 2700 * - If any of the non-initial bytes could be the start of a character, 2701 * we stop the illegal sequence before the first one of those. 2702 * 2703 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2704 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2705 * Otherwise we convert or report the pair of bytes. 2706 */ 2707 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2708 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2709 if (leadIsOk && trailIsOk) { 2710 ++mySource; 2711 tempBuf[0] = (char)(mySourceChar + 0x80); 2712 tempBuf[1] = (char)(trailByte + 0x80); 2713 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2714 mySourceChar = (mySourceChar << 8) | trailByte; 2715 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2716 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2717 ++mySource; 2718 /* add another bit so that the code below writes 2 bytes in case of error */ 2719 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2720 } 2721 } else { 2722 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2723 args->converter->toULength = 1; 2724 break; 2725 } 2726 } 2727 else if(mySourceChar <= 0x7f) { 2728 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2729 } else { 2730 targetUniChar = 0xffff; 2731 } 2732 if(targetUniChar < 0xfffe){ 2733 if(args->offsets) { 2734 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 2735 } 2736 *(myTarget++)=(UChar)targetUniChar; 2737 } 2738 else { 2739 /* Call the callback function*/ 2740 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2741 break; 2742 } 2743 } 2744 else{ 2745 *err =U_BUFFER_OVERFLOW_ERROR; 2746 break; 2747 } 2748 } 2749 args->target = myTarget; 2750 args->source = mySource; 2751} 2752 2753/*************************** END ISO2022-KR *********************************/ 2754 2755/*************************** ISO-2022-CN ********************************* 2756* 2757* Rules for ISO-2022-CN Encoding: 2758* i) The designator sequence must appear once on a line before any instance 2759* of character set it designates. 2760* ii) If two lines contain characters from the same character set, both lines 2761* must include the designator sequence. 2762* iii) Once the designator sequence is known, a shifting sequence has to be found 2763* to invoke the shifting 2764* iv) All lines start in ASCII and end in ASCII. 2765* v) Four shifting sequences are employed for this purpose: 2766* 2767* Sequcence ASCII Eq Charsets 2768* ---------- ------- --------- 2769* SI <SI> US-ASCII 2770* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2771* SS2 <ESC>N CNS-11643-1992 Plane 2 2772* SS3 <ESC>O CNS-11643-1992 Planes 3-7 2773* 2774* vi) 2775* SOdesignator : ESC "$" ")" finalchar_for_SO 2776* SS2designator : ESC "$" "*" finalchar_for_SS2 2777* SS3designator : ESC "$" "+" finalchar_for_SS3 2778* 2779* ESC $ ) A Indicates the bytes following SO are Chinese 2780* characters as defined in GB 2312-80, until 2781* another SOdesignation appears 2782* 2783* 2784* ESC $ ) E Indicates the bytes following SO are as defined 2785* in ISO-IR-165 (for details, see section 2.1), 2786* until another SOdesignation appears 2787* 2788* ESC $ ) G Indicates the bytes following SO are as defined 2789* in CNS 11643-plane-1, until another 2790* SOdesignation appears 2791* 2792* ESC $ * H Indicates the two bytes immediately following 2793* SS2 is a 
Chinese character as defined in CNS 2794* 11643-plane-2, until another SS2designation 2795* appears 2796* (Meaning <ESC>N must preceed every 2 byte 2797* sequence.) 2798* 2799* ESC $ + I Indicates the immediate two bytes following SS3 2800* is a Chinese character as defined in CNS 2801* 11643-plane-3, until another SS3designation 2802* appears 2803* (Meaning <ESC>O must preceed every 2 byte 2804* sequence.) 2805* 2806* ESC $ + J Indicates the immediate two bytes following SS3 2807* is a Chinese character as defined in CNS 2808* 11643-plane-4, until another SS3designation 2809* appears 2810* (In English: <ESC>O must preceed every 2 byte 2811* sequence.) 2812* 2813* ESC $ + K Indicates the immediate two bytes following SS3 2814* is a Chinese character as defined in CNS 2815* 11643-plane-5, until another SS3designation 2816* appears 2817* 2818* ESC $ + L Indicates the immediate two bytes following SS3 2819* is a Chinese character as defined in CNS 2820* 11643-plane-6, until another SS3designation 2821* appears 2822* 2823* ESC $ + M Indicates the immediate two bytes following SS3 2824* is a Chinese character as defined in CNS 2825* 11643-plane-7, until another SS3designation 2826* appears 2827* 2828* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and 2829* has its own designation information before any Chinese characters 2830* appear 2831* 2832*/ 2833 2834/* The following are defined this way to make the strings truely readonly */ 2835static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; 2836static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; 2837static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; 2838static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; 2839static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; 2840static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; 2841static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; 2842static const char 
CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; 2843static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; 2844 2845/********************** ISO2022-CN Data **************************/ 2846static const char* const escSeqCharsCN[10] ={ 2847 SHIFT_IN_STR, /* ASCII */ 2848 GB_2312_80_STR, 2849 ISO_IR_165_STR, 2850 CNS_11643_1992_Plane_1_STR, 2851 CNS_11643_1992_Plane_2_STR, 2852 CNS_11643_1992_Plane_3_STR, 2853 CNS_11643_1992_Plane_4_STR, 2854 CNS_11643_1992_Plane_5_STR, 2855 CNS_11643_1992_Plane_6_STR, 2856 CNS_11643_1992_Plane_7_STR 2857}; 2858 2859static void 2860UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2861 UConverter *cnv = args->converter; 2862 UConverterDataISO2022 *converterData; 2863 ISO2022State *pFromU2022State; 2864 uint8_t *target = (uint8_t *) args->target; 2865 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 2866 const UChar* source = args->source; 2867 const UChar* sourceLimit = args->sourceLimit; 2868 int32_t* offsets = args->offsets; 2869 UChar32 sourceChar; 2870 char buffer[8]; 2871 int32_t len; 2872 int8_t choices[3]; 2873 int32_t choiceCount; 2874 uint32_t targetValue = 0; 2875 UBool useFallback; 2876 2877 /* set up the state */ 2878 converterData = (UConverterDataISO2022*)cnv->extraInfo; 2879 pFromU2022State = &converterData->fromU2022State; 2880 2881 choiceCount = 0; 2882 2883 /* check if the last codepoint of previous buffer was a lead surrogate*/ 2884 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 2885 goto getTrail; 2886 } 2887 2888 while( source < sourceLimit){ 2889 if(target < targetLimit){ 2890 2891 sourceChar = *(source++); 2892 /*check if the char is a First surrogate*/ 2893 if(UTF_IS_SURROGATE(sourceChar)) { 2894 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { 2895getTrail: 2896 /*look ahead to find the trail surrogate*/ 2897 if(source < sourceLimit) { 2898 /* test the following code unit */ 2899 UChar trail=(UChar) *source; 2900 
if(UTF_IS_SECOND_SURROGATE(trail)) { 2901 source++; 2902 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); 2903 cnv->fromUChar32=0x00; 2904 /* convert this supplementary code point */ 2905 /* exit this condition tree */ 2906 } else { 2907 /* this is an unmatched lead code unit (1st surrogate) */ 2908 /* callback(illegal) */ 2909 *err=U_ILLEGAL_CHAR_FOUND; 2910 cnv->fromUChar32=sourceChar; 2911 break; 2912 } 2913 } else { 2914 /* no more input */ 2915 cnv->fromUChar32=sourceChar; 2916 break; 2917 } 2918 } else { 2919 /* this is an unmatched trail code unit (2nd surrogate) */ 2920 /* callback(illegal) */ 2921 *err=U_ILLEGAL_CHAR_FOUND; 2922 cnv->fromUChar32=sourceChar; 2923 break; 2924 } 2925 } 2926 2927 /* do the conversion */ 2928 if(sourceChar <= 0x007f ){ 2929 /* do not convert SO/SI/ESC */ 2930 if(IS_2022_CONTROL(sourceChar)) { 2931 /* callback(illegal) */ 2932 *err=U_ILLEGAL_CHAR_FOUND; 2933 cnv->fromUChar32=sourceChar; 2934 break; 2935 } 2936 2937 /* US-ASCII */ 2938 if(pFromU2022State->g == 0) { 2939 buffer[0] = (char)sourceChar; 2940 len = 1; 2941 } else { 2942 buffer[0] = UCNV_SI; 2943 buffer[1] = (char)sourceChar; 2944 len = 2; 2945 pFromU2022State->g = 0; 2946 choiceCount = 0; 2947 } 2948 if(sourceChar == CR || sourceChar == LF) { 2949 /* reset the state at the end of a line */ 2950 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 2951 choiceCount = 0; 2952 } 2953 } 2954 else{ 2955 /* convert U+0080..U+10ffff */ 2956 int32_t i; 2957 int8_t cs, g; 2958 2959 if(choiceCount == 0) { 2960 /* try the current SO/G1 converter first */ 2961 choices[0] = pFromU2022State->cs[1]; 2962 2963 /* default to GB2312_1 if none is designated yet */ 2964 if(choices[0] == 0) { 2965 choices[0] = GB2312_1; 2966 } 2967 2968 if(converterData->version == 0) { 2969 /* ISO-2022-CN */ 2970 2971 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ 2972 if(choices[0] == GB2312_1) { 2973 choices[1] = (int8_t)CNS_11643_1; 2974 } else { 2975 choices[1] 
= (int8_t)GB2312_1; 2976 } 2977 2978 choiceCount = 2; 2979 } else if (converterData->version == 1) { 2980 /* ISO-2022-CN-EXT */ 2981 2982 /* try one of the other converters */ 2983 switch(choices[0]) { 2984 case GB2312_1: 2985 choices[1] = (int8_t)CNS_11643_1; 2986 choices[2] = (int8_t)ISO_IR_165; 2987 break; 2988 case ISO_IR_165: 2989 choices[1] = (int8_t)GB2312_1; 2990 choices[2] = (int8_t)CNS_11643_1; 2991 break; 2992 default: /* CNS_11643_x */ 2993 choices[1] = (int8_t)GB2312_1; 2994 choices[2] = (int8_t)ISO_IR_165; 2995 break; 2996 } 2997 2998 choiceCount = 3; 2999 } else { 3000 choices[0] = (int8_t)CNS_11643_1; 3001 choices[1] = (int8_t)GB2312_1; 3002 } 3003 } 3004 3005 cs = g = 0; 3006 /* 3007 * len==0: no mapping found yet 3008 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3009 * len>0: found a roundtrip result, done 3010 */ 3011 len = 0; 3012 /* 3013 * We will turn off useFallback after finding a fallback, 3014 * but we still get fallbacks from PUA code points as usual. 3015 * Therefore, we will also need to check that we don't overwrite 3016 * an early fallback with a later one. 
3017 */ 3018 useFallback = cnv->useFallback; 3019 3020 for(i = 0; i < choiceCount && len <= 0; ++i) { 3021 int8_t cs0 = choices[i]; 3022 if(cs0 > 0) { 3023 uint32_t value; 3024 int32_t len2; 3025 if(cs0 >= CNS_11643_0) { 3026 len2 = MBCS_FROM_UCHAR32_ISO2022( 3027 converterData->myConverterArray[CNS_11643], 3028 sourceChar, 3029 &value, 3030 useFallback, 3031 MBCS_OUTPUT_3); 3032 if(len2 == 3 || (len2 == -3 && len == 0)) { 3033 targetValue = value; 3034 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3035 if(len2 >= 0) { 3036 len = 2; 3037 } else { 3038 len = -2; 3039 useFallback = FALSE; 3040 } 3041 if(cs == CNS_11643_1) { 3042 g = 1; 3043 } else if(cs == CNS_11643_2) { 3044 g = 2; 3045 } else /* plane 3..7 */ if(converterData->version == 1) { 3046 g = 3; 3047 } else { 3048 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3049 len = 0; 3050 } 3051 } 3052 } else { 3053 /* GB2312_1 or ISO-IR-165 */ 3054 len2 = MBCS_FROM_UCHAR32_ISO2022( 3055 converterData->myConverterArray[cs0], 3056 sourceChar, 3057 &value, 3058 useFallback, 3059 MBCS_OUTPUT_2); 3060 if(len2 == 2 || (len2 == -2 && len == 0)) { 3061 targetValue = value; 3062 len = len2; 3063 cs = cs0; 3064 g = 1; 3065 useFallback = FALSE; 3066 } 3067 } 3068 } 3069 } 3070 3071 if(len != 0) { 3072 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3073 3074 /* write the designation sequence if necessary */ 3075 if(cs != pFromU2022State->cs[g]) { 3076 if(cs < CNS_11643) { 3077 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3078 } else { 3079 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3080 } 3081 len = 4; 3082 pFromU2022State->cs[g] = cs; 3083 if(g == 1) { 3084 /* changing the SO/G1 charset invalidates the choices[] */ 3085 choiceCount = 0; 3086 } 3087 } 3088 3089 /* write the shift sequence if necessary */ 3090 if(g != pFromU2022State->g) { 3091 switch(g) { 3092 case 1: 3093 buffer[len++] = UCNV_SO; 3094 3095 /* set the new state only if it is the locking shift 
SO/G1, not for SS2 or SS3 */ 3096 pFromU2022State->g = 1; 3097 break; 3098 case 2: 3099 buffer[len++] = 0x1b; 3100 buffer[len++] = 0x4e; 3101 break; 3102 default: /* case 3 */ 3103 buffer[len++] = 0x1b; 3104 buffer[len++] = 0x4f; 3105 break; 3106 } 3107 } 3108 3109 /* write the two output bytes */ 3110 buffer[len++] = (char)(targetValue >> 8); 3111 buffer[len++] = (char)targetValue; 3112 } else { 3113 /* if we cannot find the character after checking all codepages 3114 * then this is an error 3115 */ 3116 *err = U_INVALID_CHAR_FOUND; 3117 cnv->fromUChar32=sourceChar; 3118 break; 3119 } 3120 } 3121 3122 /* output len>0 bytes in buffer[] */ 3123 if(len == 1) { 3124 *target++ = buffer[0]; 3125 if(offsets) { 3126 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3127 } 3128 } else if(len == 2 && (target + 2) <= targetLimit) { 3129 *target++ = buffer[0]; 3130 *target++ = buffer[1]; 3131 if(offsets) { 3132 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3133 *offsets++ = sourceIndex; 3134 *offsets++ = sourceIndex; 3135 } 3136 } else { 3137 fromUWriteUInt8( 3138 cnv, 3139 buffer, len, 3140 &target, (const char *)targetLimit, 3141 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3142 err); 3143 if(U_FAILURE(*err)) { 3144 break; 3145 } 3146 } 3147 } /* end if(myTargetIndex<myTargetLength) */ 3148 else{ 3149 *err =U_BUFFER_OVERFLOW_ERROR; 3150 break; 3151 } 3152 3153 }/* end while(mySourceIndex<mySourceLength) */ 3154 3155 /* 3156 * the end of the input stream and detection of truncated input 3157 * are handled by the framework, but for ISO-2022-CN conversion 3158 * we need to be in ASCII mode at the very end 3159 * 3160 * conditions: 3161 * successful 3162 * not in ASCII mode 3163 * end of input and no truncated input 3164 */ 3165 if( U_SUCCESS(*err) && 3166 pFromU2022State->g!=0 && 3167 args->flush && source>=sourceLimit && cnv->fromUChar32==0 3168 ) { 3169 int32_t sourceIndex; 3170 3171 /* 
we are switching to ASCII */ 3172 pFromU2022State->g=0; 3173 3174 /* get the source index of the last input character */ 3175 /* 3176 * TODO this would be simpler and more reliable if we used a pair 3177 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3178 * so that we could simply use the prevSourceIndex here; 3179 * this code gives an incorrect result for the rare case of an unmatched 3180 * trail surrogate that is alone in the last buffer of the text stream 3181 */ 3182 sourceIndex=(int32_t)(source-args->source); 3183 if(sourceIndex>0) { 3184 --sourceIndex; 3185 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3186 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3187 ) { 3188 --sourceIndex; 3189 } 3190 } else { 3191 sourceIndex=-1; 3192 } 3193 3194 fromUWriteUInt8( 3195 cnv, 3196 SHIFT_IN_STR, 1, 3197 &target, (const char *)targetLimit, 3198 &offsets, sourceIndex, 3199 err); 3200 } 3201 3202 /*save the state and return */ 3203 args->source = source; 3204 args->target = (char*)target; 3205} 3206 3207 3208static void 3209UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3210 UErrorCode* err){ 3211 char tempBuf[3]; 3212 const char *mySource = (char *) args->source; 3213 UChar *myTarget = args->target; 3214 const char *mySourceLimit = args->sourceLimit; 3215 uint32_t targetUniChar = 0x0000; 3216 uint32_t mySourceChar = 0x0000; 3217 UConverterDataISO2022* myData; 3218 ISO2022State *pToU2022State; 3219 3220 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3221 pToU2022State = &myData->toU2022State; 3222 3223 if(myData->key != 0) { 3224 /* continue with a partial escape sequence */ 3225 goto escape; 3226 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3227 /* continue with a partial double-byte character */ 3228 mySourceChar = args->converter->toUBytes[0]; 3229 args->converter->toULength = 0; 3230 targetUniChar = missingCharMarker; 3231 goto getTrailByte; 
3232 } 3233 3234 while(mySource < mySourceLimit){ 3235 3236 targetUniChar =missingCharMarker; 3237 3238 if(myTarget < args->targetLimit){ 3239 3240 mySourceChar= (unsigned char) *mySource++; 3241 3242 switch(mySourceChar){ 3243 case UCNV_SI: 3244 pToU2022State->g=0; 3245 if (myData->isEmptySegment) { 3246 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3247 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3248 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3249 args->converter->toUBytes[0] = mySourceChar; 3250 args->converter->toULength = 1; 3251 args->target = myTarget; 3252 args->source = mySource; 3253 return; 3254 } 3255 continue; 3256 3257 case UCNV_SO: 3258 if(pToU2022State->cs[1] != 0) { 3259 pToU2022State->g=1; 3260 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3261 continue; 3262 } else { 3263 /* illegal to have SO before a matching designator */ 3264 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3265 break; 3266 } 3267 3268 case ESC_2022: 3269 mySource--; 3270escape: 3271 { 3272 const char * mySourceBefore = mySource; 3273 int8_t toULengthBefore = args->converter->toULength; 3274 3275 changeState_2022(args->converter,&(mySource), 3276 mySourceLimit, ISO_2022_CN,err); 3277 3278 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3279 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3280 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3281 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3282 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 3283 } 3284 } 3285 3286 /* invalid or illegal escape sequence */ 3287 if(U_FAILURE(*err)){ 3288 args->target = myTarget; 3289 args->source = mySource; 3290 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ 3291 return; 3292 } 3293 continue; 3294 3295 /* ISO-2022-CN does 
not use single-byte (C1) SS2 and SS3 */ 3296 3297 case CR: 3298 /*falls through*/ 3299 case LF: 3300 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3301 /* falls through */ 3302 default: 3303 /* convert one or two bytes */ 3304 myData->isEmptySegment = FALSE; 3305 if(pToU2022State->g != 0) { 3306 if(mySource < mySourceLimit) { 3307 UConverterSharedData *cnv; 3308 StateEnum tempState; 3309 int32_t tempBufLen; 3310 int leadIsOk, trailIsOk; 3311 uint8_t trailByte; 3312getTrailByte: 3313 trailByte = (uint8_t)*mySource; 3314 /* 3315 * Ticket 5691: consistent illegal sequences: 3316 * - We include at least the first byte in the illegal sequence. 3317 * - If any of the non-initial bytes could be the start of a character, 3318 * we stop the illegal sequence before the first one of those. 3319 * 3320 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3321 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3322 * Otherwise we convert or report the pair of bytes. 
3323 */ 3324 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3325 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3326 if (leadIsOk && trailIsOk) { 3327 ++mySource; 3328 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3329 if(tempState >= CNS_11643_0) { 3330 cnv = myData->myConverterArray[CNS_11643]; 3331 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3332 tempBuf[1] = (char) (mySourceChar); 3333 tempBuf[2] = (char) trailByte; 3334 tempBufLen = 3; 3335 3336 }else{ 3337 cnv = myData->myConverterArray[tempState]; 3338 tempBuf[0] = (char) (mySourceChar); 3339 tempBuf[1] = (char) trailByte; 3340 tempBufLen = 2; 3341 } 3342 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3343 mySourceChar = (mySourceChar << 8) | trailByte; 3344 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3345 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3346 ++mySource; 3347 /* add another bit so that the code below writes 2 bytes in case of error */ 3348 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3349 } 3350 if(pToU2022State->g>=2) { 3351 /* return from a single-shift state to the previous one */ 3352 pToU2022State->g=pToU2022State->prevG; 3353 } 3354 } else { 3355 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3356 args->converter->toULength = 1; 3357 goto endloop; 3358 } 3359 } 3360 else{ 3361 if(mySourceChar <= 0x7f) { 3362 targetUniChar = (UChar) mySourceChar; 3363 } 3364 } 3365 break; 3366 } 3367 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3368 if(args->offsets){ 3369 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3370 } 3371 *(myTarget++)=(UChar)targetUniChar; 3372 } 3373 else if(targetUniChar > missingCharMarker){ 3374 /* disassemble the surrogate pair and write to output*/ 3375 targetUniChar-=0x0010000; 3376 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3377 if(args->offsets){ 3378 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3379 } 3380 ++myTarget; 3381 if(myTarget< args->targetLimit){ 3382 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3383 if(args->offsets){ 3384 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3385 } 3386 ++myTarget; 3387 }else{ 3388 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3389 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3390 } 3391 3392 } 3393 else{ 3394 /* Call the callback function*/ 3395 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3396 break; 3397 } 3398 } 3399 else{ 3400 *err =U_BUFFER_OVERFLOW_ERROR; 3401 break; 3402 } 3403 } 3404endloop: 3405 args->target = myTarget; 3406 args->source = mySource; 3407} 3408 3409static void 3410_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3411 UConverter *cnv = args->converter; 3412 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3413 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3414 char *p, *subchar; 3415 char buffer[8]; 3416 int32_t length; 3417 3418 subchar=(char *)cnv->subChars; 3419 length=cnv->subCharLen; /* assume length==1 for most variants */ 3420 3421 p = buffer; 3422 switch(myConverterData->locale[0]){ 3423 case 'j': 3424 { 3425 int8_t cs; 3426 3427 if(pFromU2022State->g == 1) { 3428 /* JIS7: switch from G1 to G0 */ 3429 pFromU2022State->g = 0; 3430 *p++ = UCNV_SI; 3431 } 3432 3433 cs = pFromU2022State->cs[0]; 3434 if(cs != ASCII && cs != JISX201) { 3435 /* not in ASCII or JIS X 0201: 
switch to ASCII */ 3436 pFromU2022State->cs[0] = (int8_t)ASCII; 3437 *p++ = '\x1b'; 3438 *p++ = '\x28'; 3439 *p++ = '\x42'; 3440 } 3441 3442 *p++ = subchar[0]; 3443 break; 3444 } 3445 case 'c': 3446 if(pFromU2022State->g != 0) { 3447 /* not in ASCII mode: switch to ASCII */ 3448 pFromU2022State->g = 0; 3449 *p++ = UCNV_SI; 3450 } 3451 *p++ = subchar[0]; 3452 break; 3453 case 'k': 3454 if(myConverterData->version == 0) { 3455 if(length == 1) { 3456 if((UBool)args->converter->fromUnicodeStatus) { 3457 /* in DBCS mode: switch to SBCS */ 3458 args->converter->fromUnicodeStatus = 0; 3459 *p++ = UCNV_SI; 3460 } 3461 *p++ = subchar[0]; 3462 } else /* length == 2*/ { 3463 if(!(UBool)args->converter->fromUnicodeStatus) { 3464 /* in SBCS mode: switch to DBCS */ 3465 args->converter->fromUnicodeStatus = 1; 3466 *p++ = UCNV_SO; 3467 } 3468 *p++ = subchar[0]; 3469 *p++ = subchar[1]; 3470 } 3471 break; 3472 } else { 3473 /* save the subconverter's substitution string */ 3474 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3475 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3476 3477 /* set our substitution string into the subconverter */ 3478 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3479 myConverterData->currentConverter->subCharLen = (int8_t)length; 3480 3481 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3482 args->converter = myConverterData->currentConverter; 3483 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3484 ucnv_cbFromUWriteSub(args, 0, err); 3485 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3486 args->converter = cnv; 3487 3488 /* restore the subconverter's substitution string */ 3489 myConverterData->currentConverter->subChars = currentSubChars; 3490 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3491 3492 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3493 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3494 uprv_memcpy( 3495 cnv->charErrorBuffer, 3496 myConverterData->currentConverter->charErrorBuffer, 3497 myConverterData->currentConverter->charErrorBufferLength); 3498 } 3499 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3500 myConverterData->currentConverter->charErrorBufferLength = 0; 3501 } 3502 return; 3503 } 3504 default: 3505 /* not expected */ 3506 break; 3507 } 3508 ucnv_cbFromUWriteBytes(args, 3509 buffer, (int32_t)(p - buffer), 3510 offsetIndex, err); 3511} 3512 3513/* 3514 * Structure for cloning an ISO 2022 converter into a single memory block. 3515 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3516 * and then ucnv_safeClone() of the sub-converter may additionally align 3517 * currentConverter inside the cloneStruct, for which we need the deadSpace 3518 * after currentConverter. 3519 * This is because UAlignedMemory may be larger than the actually 3520 * necessary alignment size for the platform. 3521 * The other cloneStruct fields will not be moved around, 3522 * and are aligned properly with cloneStruct's alignment. 
3523 */ 3524struct cloneStruct 3525{ 3526 UConverter cnv; 3527 UConverter currentConverter; 3528 UAlignedMemory deadSpace; 3529 UConverterDataISO2022 mydata; 3530}; 3531 3532 3533static UConverter * 3534_ISO_2022_SafeClone( 3535 const UConverter *cnv, 3536 void *stackBuffer, 3537 int32_t *pBufferSize, 3538 UErrorCode *status) 3539{ 3540 struct cloneStruct * localClone; 3541 UConverterDataISO2022 *cnvData; 3542 int32_t i, size; 3543 3544 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 3545 *pBufferSize = (int32_t)sizeof(struct cloneStruct); 3546 return NULL; 3547 } 3548 3549 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 3550 localClone = (struct cloneStruct *)stackBuffer; 3551 3552 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 3553 3554 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 3555 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 3556 localClone->cnv.isExtraLocal = TRUE; 3557 3558 /* share the subconverters */ 3559 3560 if(cnvData->currentConverter != NULL) { 3561 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ 3562 localClone->mydata.currentConverter = 3563 ucnv_safeClone(cnvData->currentConverter, 3564 &localClone->currentConverter, 3565 &size, status); 3566 if(U_FAILURE(*status)) { 3567 return NULL; 3568 } 3569 } 3570 3571 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 3572 if(cnvData->myConverterArray[i] != NULL) { 3573 ucnv_incrementRefCount(cnvData->myConverterArray[i]); 3574 } 3575 } 3576 3577 return &localClone->cnv; 3578} 3579 3580static void 3581_ISO_2022_GetUnicodeSet(const UConverter *cnv, 3582 const USetAdder *sa, 3583 UConverterUnicodeSet which, 3584 UErrorCode *pErrorCode) 3585{ 3586 int32_t i; 3587 UConverterDataISO2022* cnvData; 3588 3589 if (U_FAILURE(*pErrorCode)) { 3590 return; 3591 } 3592#ifdef U_ENABLE_GENERIC_ISO_2022 3593 if (cnv->sharedData == &_ISO2022Data) { 3594 /* We 
use UTF-8 in this case */ 3595 sa->addRange(sa->set, 0, 0xd7FF); 3596 sa->addRange(sa->set, 0xE000, 0x10FFFF); 3597 return; 3598 } 3599#endif 3600 3601 cnvData = (UConverterDataISO2022*)cnv->extraInfo; 3602 3603 /* open a set and initialize it with code points that are algorithmically round-tripped */ 3604 switch(cnvData->locale[0]){ 3605 case 'j': 3606 /* include JIS X 0201 which is hardcoded */ 3607 sa->add(sa->set, 0xa5); 3608 sa->add(sa->set, 0x203e); 3609 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 3610 /* include Latin-1 for some variants of JP */ 3611 sa->addRange(sa->set, 0, 0xff); 3612 } else { 3613 /* include ASCII for JP */ 3614 sa->addRange(sa->set, 0, 0x7f); 3615 } 3616 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 3617 /* 3618 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 3619 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 3620 * use half-width Katakana. 3621 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 3622 * half-width Katakana via the ESC ( I sequence. 3623 * However, we only emit (fromUnicode) half-width Katakana according to the 3624 * definition of each variant. 3625 * 3626 * When including fallbacks, 3627 * we need to include half-width Katakana Unicode code points for all JP variants because 3628 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
3629 */ 3630 /* include half-width Katakana for JP */ 3631 sa->addRange(sa->set, HWKANA_START, HWKANA_END); 3632 } 3633 break; 3634 case 'c': 3635 case 'z': 3636 /* include ASCII for CN */ 3637 sa->addRange(sa->set, 0, 0x7f); 3638 break; 3639 case 'k': 3640 /* there is only one converter for KR, and it is not in the myConverterArray[] */ 3641 cnvData->currentConverter->sharedData->impl->getUnicodeSet( 3642 cnvData->currentConverter, sa, which, pErrorCode); 3643 /* the loop over myConverterArray[] will simply not find another converter */ 3644 break; 3645 default: 3646 break; 3647 } 3648 3649#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 3650 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3651 cnvData->version==0 && i==CNS_11643 3652 ) { 3653 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 3654 ucnv_MBCSGetUnicodeSetForBytes( 3655 cnvData->myConverterArray[i], 3656 sa, UCNV_ROUNDTRIP_SET, 3657 0, 0x81, 0x82, 3658 pErrorCode); 3659 } 3660#endif 3661 3662 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 3663 UConverterSetFilter filter; 3664 if(cnvData->myConverterArray[i]!=NULL) { 3665 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3666 cnvData->version==0 && i==CNS_11643 3667 ) { 3668 /* 3669 * Version-specific for CN: 3670 * CN version 0 does not map CNS planes 3..7 although 3671 * they are all available in the CNS conversion table; 3672 * CN version 1 (-EXT) does map them all. 3673 * The two versions create different Unicode sets. 3674 */ 3675 filter=UCNV_SET_FILTER_2022_CN; 3676 } else if(cnvData->locale[0]=='j' && i==JISX208) { 3677 /* 3678 * Only add code points that map to Shift-JIS codes 3679 * corresponding to JIS X 0208. 
3680 */ 3681 filter=UCNV_SET_FILTER_SJIS; 3682 } else if(i==KSC5601) { 3683 /* 3684 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 3685 * are broader than GR94. 3686 */ 3687 filter=UCNV_SET_FILTER_GR94DBCS; 3688 } else { 3689 filter=UCNV_SET_FILTER_NONE; 3690 } 3691 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 3692 } 3693 } 3694 3695 /* 3696 * ISO 2022 converters must not convert SO/SI/ESC despite what 3697 * sub-converters do by themselves. 3698 * Remove these characters from the set. 3699 */ 3700 sa->remove(sa->set, 0x0e); 3701 sa->remove(sa->set, 0x0f); 3702 sa->remove(sa->set, 0x1b); 3703 3704 /* ISO 2022 converters do not convert C1 controls either */ 3705 sa->removeRange(sa->set, 0x80, 0x9f); 3706} 3707 3708static const UConverterImpl _ISO2022Impl={ 3709 UCNV_ISO_2022, 3710 3711 NULL, 3712 NULL, 3713 3714 _ISO2022Open, 3715 _ISO2022Close, 3716 _ISO2022Reset, 3717 3718#ifdef U_ENABLE_GENERIC_ISO_2022 3719 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3720 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3721 ucnv_fromUnicode_UTF8, 3722 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 3723#else 3724 NULL, 3725 NULL, 3726 NULL, 3727 NULL, 3728#endif 3729 NULL, 3730 3731 NULL, 3732 _ISO2022getName, 3733 _ISO_2022_WriteSub, 3734 _ISO_2022_SafeClone, 3735 _ISO_2022_GetUnicodeSet 3736}; 3737static const UConverterStaticData _ISO2022StaticData={ 3738 sizeof(UConverterStaticData), 3739 "ISO_2022", 3740 2022, 3741 UCNV_IBM, 3742 UCNV_ISO_2022, 3743 1, 3744 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 3745 { 0x1a, 0, 0, 0 }, 3746 1, 3747 FALSE, 3748 FALSE, 3749 0, 3750 0, 3751 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3752}; 3753const UConverterSharedData _ISO2022Data={ 3754 sizeof(UConverterSharedData), 3755 ~((uint32_t) 0), 3756 NULL, 3757 NULL, 3758 &_ISO2022StaticData, 3759 FALSE, 3760 &_ISO2022Impl, 3761 0 3762}; 3763 
3764/*************JP****************/ 3765static const UConverterImpl _ISO2022JPImpl={ 3766 UCNV_ISO_2022, 3767 3768 NULL, 3769 NULL, 3770 3771 _ISO2022Open, 3772 _ISO2022Close, 3773 _ISO2022Reset, 3774 3775 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3776 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3777 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3778 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3779 NULL, 3780 3781 NULL, 3782 _ISO2022getName, 3783 _ISO_2022_WriteSub, 3784 _ISO_2022_SafeClone, 3785 _ISO_2022_GetUnicodeSet 3786}; 3787static const UConverterStaticData _ISO2022JPStaticData={ 3788 sizeof(UConverterStaticData), 3789 "ISO_2022_JP", 3790 0, 3791 UCNV_IBM, 3792 UCNV_ISO_2022, 3793 1, 3794 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ 3795 { 0x1a, 0, 0, 0 }, 3796 1, 3797 FALSE, 3798 FALSE, 3799 0, 3800 0, 3801 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3802}; 3803static const UConverterSharedData _ISO2022JPData={ 3804 sizeof(UConverterSharedData), 3805 ~((uint32_t) 0), 3806 NULL, 3807 NULL, 3808 &_ISO2022JPStaticData, 3809 FALSE, 3810 &_ISO2022JPImpl, 3811 0 3812}; 3813 3814/************* KR ***************/ 3815static const UConverterImpl _ISO2022KRImpl={ 3816 UCNV_ISO_2022, 3817 3818 NULL, 3819 NULL, 3820 3821 _ISO2022Open, 3822 _ISO2022Close, 3823 _ISO2022Reset, 3824 3825 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3826 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3827 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3828 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3829 NULL, 3830 3831 NULL, 3832 _ISO2022getName, 3833 _ISO_2022_WriteSub, 3834 _ISO_2022_SafeClone, 3835 _ISO_2022_GetUnicodeSet 3836}; 3837static const UConverterStaticData _ISO2022KRStaticData={ 3838 sizeof(UConverterStaticData), 3839 "ISO_2022_KR", 3840 0, 3841 UCNV_IBM, 3842 UCNV_ISO_2022, 3843 1, 3844 3, /* max 3 bytes per UChar: SO+DBCS */ 3845 { 0x1a, 0, 0, 0 }, 3846 1, 3847 FALSE, 3848 FALSE, 3849 0, 3850 0, 3851 { 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3852}; 3853static const UConverterSharedData _ISO2022KRData={ 3854 sizeof(UConverterSharedData), 3855 ~((uint32_t) 0), 3856 NULL, 3857 NULL, 3858 &_ISO2022KRStaticData, 3859 FALSE, 3860 &_ISO2022KRImpl, 3861 0 3862}; 3863 3864/*************** CN ***************/ 3865static const UConverterImpl _ISO2022CNImpl={ 3866 3867 UCNV_ISO_2022, 3868 3869 NULL, 3870 NULL, 3871 3872 _ISO2022Open, 3873 _ISO2022Close, 3874 _ISO2022Reset, 3875 3876 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3877 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3878 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3879 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3880 NULL, 3881 3882 NULL, 3883 _ISO2022getName, 3884 _ISO_2022_WriteSub, 3885 _ISO_2022_SafeClone, 3886 _ISO_2022_GetUnicodeSet 3887}; 3888static const UConverterStaticData _ISO2022CNStaticData={ 3889 sizeof(UConverterStaticData), 3890 "ISO_2022_CN", 3891 0, 3892 UCNV_IBM, 3893 UCNV_ISO_2022, 3894 1, 3895 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ 3896 { 0x1a, 0, 0, 0 }, 3897 1, 3898 FALSE, 3899 FALSE, 3900 0, 3901 0, 3902 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3903}; 3904static const UConverterSharedData _ISO2022CNData={ 3905 sizeof(UConverterSharedData), 3906 ~((uint32_t) 0), 3907 NULL, 3908 NULL, 3909 &_ISO2022CNStaticData, 3910 FALSE, 3911 &_ISO2022CNImpl, 3912 0 3913}; 3914 3915 3916 3917#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 3918