1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4 ******************************************************************************* 5 * Copyright (C) 2008-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9package com.ibm.icu.charset; 10 11import java.nio.ByteBuffer; 12import java.nio.CharBuffer; 13import java.nio.IntBuffer; 14import java.nio.charset.CharsetDecoder; 15import java.nio.charset.CharsetEncoder; 16import java.nio.charset.CoderResult; 17import java.util.Arrays; 18 19import com.ibm.icu.charset.CharsetMBCS.CharsetDecoderMBCS; 20import com.ibm.icu.charset.CharsetMBCS.CharsetEncoderMBCS; 21import com.ibm.icu.lang.UCharacter; 22import com.ibm.icu.text.UTF16; 23import com.ibm.icu.text.UnicodeSet; 24 25class CharsetISO2022 extends CharsetICU { 26 private UConverterDataISO2022 myConverterData; 27 private int variant; // one of enum {ISO_2022_JP, ISO_2022_KR, or ISO_2022_CN} 28 29 private static final byte[] SHIFT_IN_STR = { 0x0f }; 30// private static final byte[] SHIFT_OUT_STR = { 0x0e }; 31 32 private static final byte CR = 0x0D; 33 private static final byte LF = 0x0A; 34/* 35 private static final byte H_TAB = 0x09; 36 private static final byte SPACE = 0x20; 37*/ 38 private static final char HWKANA_START = 0xff61; 39 private static final char HWKANA_END = 0xff9f; 40 41 /* 42 * 94-character sets with native byte values A1..FE are encoded in ISO 2022 43 * as bytes 21..7E. (Subtract 0x80.) 44 * 96-character sets with native bit values A0..FF are encoded in ISO 2022 45 * as bytes 20..7F. (Subtract 0x80.) 46 * Do not encode C1 control codes with native bytes 80..9F 47 * as bytes 00..1F (C0 control codes). 
48 */ 49/* 50 private static final char GR94_START = 0xa1; 51 private static final char GR94_END = 0xfe; 52*/ 53 private static final char GR96_START = 0xa0; 54 private static final char GR96_END = 0xff; 55 56 /* for ISO-2022-JP and -CN implementations */ 57 // typedef enum { 58 /* shared values */ 59 private static final byte INVALID_STATE = -1; 60 private static final byte ASCII = 0; 61 62 private static final byte SS2_STATE = 0x10; 63 private static final byte SS3_STATE = 0x11; 64 65 /* JP */ 66 private static final byte ISO8859_1 = 1; 67 private static final byte ISO8859_7 = 2; 68 private static final byte JISX201 = 3; 69 private static final byte JISX208 = 4; 70 private static final byte JISX212 = 5; 71 private static final byte GB2312 = 6; 72 private static final byte KSC5601 = 7; 73 private static final byte HWKANA_7BIT = 8; /* Halfwidth Katakana 7 bit */ 74 75 /* CN */ 76 /* the first few enum constants must keep their values because they corresponds to myConverterArray[] */ 77 private static final byte GB2312_1 = 1; 78 private static final byte ISO_IR_165= 2; 79 private static final byte CNS_11643 = 3; 80 81 /* 82 * these are used in StateEnum and ISO2022State variables, 83 * but CNS_11643 must be used to index into myConverterArray[] 84 */ 85 private static final byte CNS_11643_0 = 0x20; 86 private static final byte CNS_11643_1 = 0x21; 87 private static final byte CNS_11643_2 = 0x22; 88 private static final byte CNS_11643_3 = 0x23; 89 private static final byte CNS_11643_4 = 0x24; 90 private static final byte CNS_11643_5 = 0x25; 91 private static final byte CNS_11643_6 = 0x26; 92 private static final byte CNS_11643_7 = 0x27; 93 // } StateEnum; 94 95 96 public CharsetISO2022(String icuCanonicalName, String javaCanonicalName, String[] aliases) { 97 super(icuCanonicalName, javaCanonicalName, aliases); 98 99 myConverterData = new UConverterDataISO2022(); 100 101 int versionIndex = icuCanonicalName.indexOf("version="); 102 int version = 
Integer.decode(icuCanonicalName.substring(versionIndex+8, versionIndex+9)).intValue(); 103 104 myConverterData.version = version; 105 106 if (icuCanonicalName.indexOf("locale=ja") > 0) { 107 ISO2022InitJP(version); 108 } else if (icuCanonicalName.indexOf("locale=zh") > 0) { 109 ISO2022InitCN(version); 110 } else /* if (icuCanonicalName.indexOf("locale=ko") > 0) */ { 111 ISO2022InitKR(version); 112 } 113 114 myConverterData.currentEncoder = (CharsetEncoderMBCS)myConverterData.currentConverter.newEncoder(); 115 myConverterData.currentDecoder = (CharsetDecoderMBCS)myConverterData.currentConverter.newDecoder(); 116 } 117 118 private void ISO2022InitJP(int version) { 119 variant = ISO_2022_JP; 120 121 maxBytesPerChar = 6; 122 minBytesPerChar = 1; 123 maxCharsPerByte = 1; 124 // open the required converters and cache them 125 if((jpCharsetMasks[version]&CSM(ISO8859_7)) != 0) { 126 myConverterData.myConverterArray[ISO8859_7] = ((CharsetMBCS)CharsetICU.forNameICU("ISO8859_7")).sharedData; 127 } 128 // myConverterData.myConverterArray[JISX201] = ((CharsetMBCS)CharsetICU.forNameICU("jisx-201")).sharedData; 129 myConverterData.myConverterArray[JISX208] = ((CharsetMBCS)CharsetICU.forNameICU("Shift-JIS")).sharedData; 130 if ((jpCharsetMasks[version]&CSM(JISX212)) != 0) { 131 myConverterData.myConverterArray[JISX212] = ((CharsetMBCS)CharsetICU.forNameICU("jisx-212")).sharedData; 132 } 133 if ((jpCharsetMasks[version]&CSM(GB2312)) != 0) { 134 myConverterData.myConverterArray[GB2312] = ((CharsetMBCS)CharsetICU.forNameICU("ibm-5478")).sharedData; 135 } 136 if ((jpCharsetMasks[version]&CSM(KSC5601)) != 0) { 137 myConverterData.myConverterArray[KSC5601] = ((CharsetMBCS)CharsetICU.forNameICU("ksc_5601")).sharedData; 138 } 139 140 // create a generic CharsetMBCS object 141 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546"); 142 } 143 144 private void ISO2022InitCN(int version) { 145 variant = ISO_2022_CN; 146 147 maxBytesPerChar = 8; 148 
minBytesPerChar = 1; 149 maxCharsPerByte = 1; 150 // open the required coverters and cache them. 151 myConverterData.myConverterArray[GB2312_1] = ((CharsetMBCS)CharsetICU.forNameICU("ibm-5478")).sharedData; 152 if (version == 1) { 153 myConverterData.myConverterArray[ISO_IR_165] = ((CharsetMBCS)CharsetICU.forNameICU("iso-ir-165")).sharedData; 154 } 155 myConverterData.myConverterArray[CNS_11643] = ((CharsetMBCS)CharsetICU.forNameICU("cns-11643-1992")).sharedData; 156 157 // create a generic CharsetMBCS object 158 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546"); 159 } 160 161 private void ISO2022InitKR(int version) { 162 variant = ISO_2022_KR; 163 164 maxBytesPerChar = 8; 165 minBytesPerChar = 1; 166 maxCharsPerByte = 1; 167 168 if (version == 1) { 169 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546"); 170 myConverterData.currentConverter.subChar1 = fromUSubstitutionChar[0][0]; 171 } else { 172 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("ibm-949"); 173 } 174 175 myConverterData.currentEncoder = (CharsetEncoderMBCS)myConverterData.currentConverter.newEncoder(); 176 myConverterData.currentDecoder = (CharsetDecoderMBCS)myConverterData.currentConverter.newDecoder(); 177 } 178 179 /* 180 * ISO 2022 control codes must not be converted from Unicode 181 * because they would mess up the byte stream. 182 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b 183 * corresponding to SO, SI, and ESC. 184 */ 185 private static boolean IS_2022_CONTROL(int c) { 186 return (c<0x20) && (((1<<c) & 0x0800c000) != 0); 187 } 188 189 /* 190 * Check that the result is a 2-byte value with each byte in the range A1..FE 191 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 192 * to move it to the ISO 2022 range 21..7E. 193 * return 0 if out of range. 
194 */ 195 private static int _2022FromGR94DBCS(int value) { 196 if ((value <= 0xfefe && value >= 0xa1a1) && 197 ((short)(value&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0xfe && ((short)(value&UConverterConstants.UNSIGNED_BYTE_MASK) >= 0xa1))) { 198 return (value - 0x8080); /* shift down to 21..7e byte range */ 199 } else { 200 return 0; /* not valid for ISO 2022 */ 201 } 202 } 203 204 /* 205 * Commented out because Ticket 5691: Call sites now check for validity. They can just += 0x8080 after that. 206 * 207 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 208 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point 209 * unchanged. 210 * 211 private static int _2022ToGR94DBCS(int value) { 212 int returnValue = value + 0x8080; 213 214 if ((returnValue <= 0xfefe && returnValue >= 0xa1a1) && 215 ((short)(returnValue&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0xfe && ((short)(returnValue&UConverterConstants.UNSIGNED_BYTE_MASK) >= 0xa1))) { 216 return returnValue; 217 } else { 218 return value; 219 } 220 }*/ 221 222 /* is the StateEnum charset value for a DBCS charset? */ 223 private static boolean IS_JP_DBCS(byte cs) { 224 return ((JISX208 <= cs) && (cs <= KSC5601)); 225 } 226 227 private static short CSM(short cs) { 228 return (short)(1<<cs); 229 } 230 231 /* This gets the valid index of the end of buffer when decoding. */ 232 private static int getEndOfBuffer_2022(ByteBuffer source) { 233 int sourceIndex = source.position(); 234 byte mySource = 0; 235 mySource = source.get(sourceIndex); 236 237 while (source.hasRemaining() && mySource != ESC_2022) { 238 mySource = source.get(); 239 if (mySource == ESC_2022) { 240 break; 241 } 242 sourceIndex++; 243 } 244 return sourceIndex; 245 } 246 247 /* 248 * This is a simple version of _MBCSGetNextUChar() calls the method in CharsetDecoderMBCS and returns 249 * the value given. 
250 * 251 * Return value: 252 * U+fffe unassigned 253 * U+ffff illegal 254 * otherwise the Unicode code point 255 */ 256 private int MBCSSimpleGetNextUChar(UConverterSharedData sharedData, 257 ByteBuffer source, 258 boolean useFallback) { 259 int returnValue; 260 UConverterSharedData tempSharedData = myConverterData.currentConverter.sharedData; 261 myConverterData.currentConverter.sharedData = sharedData; 262 returnValue = myConverterData.currentDecoder.simpleGetNextUChar(source, useFallback); 263 myConverterData.currentConverter.sharedData = tempSharedData; 264 265 return returnValue; 266 } 267 268 /* 269 * @param is the the output byte 270 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 271 */ 272 static int MBCSSingleFromUChar32(UConverterSharedData sharedData, int c, int[] retval, boolean useFallback) { 273 char[] table; 274 int value; 275 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 276 if (c >= 0x10000 && !sharedData.mbcs.hasSupplementary()) { 277 return 0; 278 } 279 /* convert the Unicode code point in c into codepage bytes */ 280 table = sharedData.mbcs.fromUnicodeTable; 281 /* get the byte for the output */ 282 value = CharsetMBCS.MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeChars, c); 283 /* get the byte for the output */ 284 retval[0] = value & 0xff; 285 if (value >= 0xf00) { 286 return 1; /* roundtrip */ 287 } else if (useFallback ? value>=0x800 : value>=0xc00) { 288 return -1; /* fallback taken */ 289 } else { 290 return 0; /* no mapping */ 291 } 292 } 293 294 /* 295 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence 296 * to whether that charset is used in the corresponding version x of ISO_2022, locale=ja,version=x 297 * 298 * Note: The converter uses some leniency: 299 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in 300 * all versions, not just JIS7 and JIS8. 
301 * - ICU does not distinguish between different version so of JIS X 0208. 302 */ 303 private static final short jpCharsetMasks[] = { 304 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)), 305 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)), 306 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)), 307 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)), 308 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)) 309 }; 310 311/* 312 // typedef enum { 313 private static final byte ASCII1 = 0; 314 private static final byte LATIN1 = 1; 315 private static final byte SBCS = 2; 316 private static final byte DBCS = 3; 317 private static final byte MBCS = 4; 318 private static final byte HWKANA = 5; 319 // } Cnv2002Type; 320*/ 321 322 private static class ISO2022State { 323 private byte []cs; /* Charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ 324 private byte g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ 325 private byte prevG; /* g before single shift (SS2 or SS3) */ 326 327 ISO2022State() { 328 cs = new byte[4]; 329 } 330 331 void reset() { 332 Arrays.fill(cs, (byte)0); 333 g = 0; 334 prevG = 0; 335 } 336 } 337 338// private static final byte UCNV_OPTIONS_VERSION_MASK = 0xf; 339 private static final byte UCNV_2022_MAX_CONVERTERS = 10; 340 341 private static class UConverterDataISO2022 { 342 UConverterSharedData []myConverterArray; 343 CharsetEncoderMBCS currentEncoder; 344 CharsetDecoderMBCS currentDecoder; 345 CharsetMBCS currentConverter; 346 ISO2022State toU2022State; 347 ISO2022State fromU2022State; 348 int key; 349 int version; 350 boolean isEmptySegment; 351 352 UConverterDataISO2022() { 353 myConverterArray = new UConverterSharedData[UCNV_2022_MAX_CONVERTERS]; 354 toU2022State = new 
ISO2022State(); 355 fromU2022State = new ISO2022State(); 356 key = 0; 357 version = 0; 358 isEmptySegment = false; 359 } 360 361 void reset() { 362 toU2022State.reset(); 363 fromU2022State.reset(); 364 isEmptySegment = false; 365 } 366 } 367 368 private static final byte ESC_2022 = 0x1B; /* ESC */ 369 370 // typedef enum { 371 private static final byte INVALID_2022 = -1; /* Doesn't correspond to a valid iso 2022 escape sequence */ 372 private static final byte VALID_NON_TERMINAL_2022 = 0; /* so far corresponds to a valid iso 2022 escape sequence */ 373 private static final byte VALID_TERMINAL_2022 = 1; /* corresponds to a valid iso 2022 escape sequence */ 374 private static final byte VALID_MAYBE_TERMINAL_2022 = 2; /* so far matches one iso 2022 escape sequence, but by adding 375 more characters might match another escape sequence */ 376 // } UCNV_TableStates_2022; 377 378 /* 379 * The way these state transition arrays work is: 380 * ex : ESC$B is the sequence for JISX208 381 * a) First Iteration: char is ESC 382 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index 383 * int x = normalize_esq_chars_2022[27] which is equal to 1 384 * ii) Search for this value in escSeqStateTable_Key_2022[] 385 * value of x is stored at escSeqStateTable_Key_2022[0] 386 * iii) Save this index as offset 387 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 388 * escSeqStateTable_value_2022[offset], which is VALID_NON_TERMINAL_2022 389 * b) Switch on this state and continue to next char 390 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index 391 * which is normalize_esq_chars_2022[36] == 4 392 * ii) x is currently 1(from above) 393 * x<<=5 -- x is now 32 394 * x+=normalize_esq_chars_2022[36] 395 * now x is 36 396 * iii) Search for this value in escSeqStateTable_Key_2022[] 397 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 398 * iv) Get state of this sequence from 
escSeqStateTable_Value_2022[] 399 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 400 * c) Switch on this state and continue to next char 401 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index 402 * ii) x is currently 36 (from above) 403 * x<<=5 -- x is now 1152 404 * x+= normalize_esq_chars_2022[66] 405 * now x is 1161 406 * iii) Search for this value in escSeqStateTable_Key_2022[] 407 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21 408 * iv) Get state of this sequence from escSeqStateTable_Value_2022[1] 409 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 410 * v) Get the converter name from escSeqStateTable_Result_2022[21] which is JISX208 411 */ 412 /* Below are the 3 arrays depicting a state transition table */ 413 private static final byte normalize_esq_chars_2022[] = { 414 /* 0 1 2 3 4 5 6 7 8 9 */ 415 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 416 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 417 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 418 0, 0, 0, 0, 0, 0, 4, 7, 29, 0, 419 2, 24, 26, 27, 0, 3, 23, 6, 0, 0, 420 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 421 0, 0, 0, 0, 5, 8, 9, 10, 11, 12, 422 13, 14, 15, 16, 17, 18, 19, 20, 25, 28, 423 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 424 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 425 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 426 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 427 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 428 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 429 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 430 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 431 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 432 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 433 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 434 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 435 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 436 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 437 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 438 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 439 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 440 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 441 0, 0, 0, 0, 0, 0 442 }; 443 444 private static final short MAX_STATES_2022 = 74; 445 private static final int escSeqStateTable_Key_2022[/* MAX_STATES_2022 */] = { 446 /* 0 
1 2 3 4 5 6 7 8 9 */ 447 1, 34, 36, 39, 55, 57, 60, 61, 1093, 1096, 448 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 449 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257, 450 1768, 1773, 1957, 35105, 36933, 36936, 36937, 36938, 36939, 36940, 451 36942, 36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644, 452 37646, 37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138, 453 40139, 40140, 40141, 1123363, 35947624, 35947625, 35947626, 35947627, 35947629, 35947630, 454 35947631, 35947635, 35947636, 35947638 455 }; 456 457 private static final byte escSeqStateTable_Value_2022[/* MAX_STATES_2022 */] = { 458 /* 0 1 2 3 4 */ 459 VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, 460 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 461 VALID_MAYBE_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 462 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 463 VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 464 VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, 465 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, 466 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 467 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 468 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 469 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 470 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 
VALID_TERMINAL_2022, VALID_TERMINAL_2022, 471 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, 472 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, 473 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022 474 }; 475 476 /* Type def for refactoring changeState_2022 code */ 477 // typedef enum { 478 private static final byte ISO_2022_JP = 1; 479 private static final byte ISO_2022_KR = 2; 480 private static final byte ISO_2022_CN = 3; 481 // } Variant2022; 482 483 /* const UConverterSharedData _ISO2022Data; */ 484 //private UConverterSharedData _ISO2022JPData; 485 //private UConverterSharedData _ISO2022KRData; 486 //private UConverterSharedData _ISO2022CNData; 487 488 /******************** to unicode ********************/ 489 /**************************************************** 490 * Recognized escape sequenes are 491 * <ESC>(B ASCII 492 * <ESC>.A ISO-8859-1 493 * <ESC>.F ISO-8859-7 494 * <ESC>(J JISX-201 495 * <ESC>(I JISX-201 496 * <ESC>$B JISX-208 497 * <ESC>$@ JISX-208 498 * <ESC>$(D JISX-212 499 * <ESC>$A GB2312 500 * <ESC>$(C KSC5601 501 */ 502 private final static byte nextStateToUnicodeJP[/* MAX_STATES_2022 */] = { 503 /* 0 1 2 3 4 5 6 7 8 9 */ 504 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 505 ASCII, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, JISX201, HWKANA_7BIT, JISX201, INVALID_STATE, 506 INVALID_STATE, INVALID_STATE, JISX208, GB2312, JISX208, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 507 ISO8859_1, ISO8859_7, JISX208, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, KSC5601, JISX212, INVALID_STATE, 508 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 
INVALID_STATE, 509 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 510 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 511 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE 512 }; 513 514 private final static byte nextStateToUnicodeCN[/* MAX_STATES_2022 */] = { 515 /* 0 1 2 3 4 5 6 7 8 9 */ 516 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE, SS3_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 517 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 518 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 519 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 520 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, GB2312_1, INVALID_STATE, ISO_IR_165, 521 CNS_11643_1, CNS_11643_2, CNS_11643_3, CNS_11643_4, CNS_11643_5, CNS_11643_6, CNS_11643_7, INVALID_STATE, INVALID_STATE, INVALID_STATE, 522 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, 523 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE 524 }; 525 526 /* runs through a state machine to determine the escape sequence - codepage correspondence */ 527 @SuppressWarnings("fallthrough") 528 private CoderResult changeState_2022(CharsetDecoderICU decoder, ByteBuffer source, int var) { 529 CoderResult err = CoderResult.UNDERFLOW; 530 boolean DONE = false; 531 byte value; 532 int key[] = {myConverterData.key}; 533 int 
offset[] = {0}; 534 int initialToULength = decoder.toULength; 535 byte c; 536 int malformLength = 0; 537 538 value = VALID_NON_TERMINAL_2022; 539 while (source.hasRemaining()) { 540 c = source.get(); 541 malformLength++; 542 decoder.toUBytesArray[decoder.toULength++] = c; 543 value = getKey_2022(c, key, offset); 544 545 switch(value) { 546 547 case VALID_NON_TERMINAL_2022: 548 /* continue with the loop */ 549 break; 550 551 case VALID_TERMINAL_2022: 552 key[0] = 0; 553 DONE = true; 554 break; 555 556 case INVALID_2022: 557 DONE = true; 558 break; 559 560 case VALID_MAYBE_TERMINAL_2022: 561 /* not ISO_2022 itself, finish here */ 562 value = VALID_TERMINAL_2022; 563 key[0] = 0; 564 DONE = true; 565 break; 566 } 567 if (DONE) { 568 break; 569 } 570 } 571// DONE: 572 myConverterData.key = key[0]; 573 574 if (value == VALID_NON_TERMINAL_2022) { 575 /* indicate that the escape sequence is incomplete: key !=0 */ 576 return err; 577 } else if (value == INVALID_2022) { 578 err = CoderResult.malformedForLength(malformLength); 579 } else /* value == VALID_TERMINAL_2022 */ { 580 switch (var) { 581 case ISO_2022_JP: { 582 byte tempState = nextStateToUnicodeJP[offset[0]]; 583 switch (tempState) { 584 case INVALID_STATE: 585 err = CoderResult.malformedForLength(malformLength); 586 break; 587 case SS2_STATE: 588 if (myConverterData.toU2022State.cs[2] != 0) { 589 if (myConverterData.toU2022State.g < 2) { 590 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g; 591 } 592 myConverterData.toU2022State.g = 2; 593 } else { 594 /* illegal to have SS2 before a matching designator */ 595 err = CoderResult.malformedForLength(malformLength); 596 } 597 break; 598 /* case SS3_STATE: not used in ISO-2022-JP-x */ 599 case ISO8859_1: 600 case ISO8859_7: 601 if ((jpCharsetMasks[myConverterData.version] & CSM(tempState)) == 0) { 602 err = CoderResult.unmappableForLength(malformLength); 603 } else { 604 /* G2 charset for SS2 */ 605 myConverterData.toU2022State.cs[2] = tempState; 606 
} 607 break; 608 default: 609 if ((jpCharsetMasks[myConverterData.version] & CSM(tempState)) == 0) { 610 err = CoderResult.unmappableForLength(source.position() - 1); 611 } else { 612 /* G0 charset */ 613 myConverterData.toU2022State.cs[0] = tempState; 614 } 615 break; 616 } // end of switch 617 break; 618 } 619 case ISO_2022_CN: { 620 byte tempState = nextStateToUnicodeCN[offset[0]]; 621 switch (tempState) { 622 case INVALID_STATE: 623 err = CoderResult.unmappableForLength(malformLength); 624 break; 625 case SS2_STATE: 626 if (myConverterData.toU2022State.cs[2] != 0) { 627 if (myConverterData.toU2022State.g < 2) { 628 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g; 629 } 630 myConverterData.toU2022State.g = 2; 631 } else { 632 /* illegal to have SS2 before a matching designator */ 633 err = CoderResult.malformedForLength(malformLength); 634 } 635 break; 636 case SS3_STATE: 637 if (myConverterData.toU2022State.cs[3] != 0) { 638 if (myConverterData.toU2022State.g < 2) { 639 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g; 640 } 641 myConverterData.toU2022State.g = 3; 642 } else { 643 /* illegal to have SS3 before a matching designator */ 644 err = CoderResult.malformedForLength(malformLength); 645 } 646 break; 647 case ISO_IR_165: 648 if (myConverterData.version == 0) { 649 err = CoderResult.unmappableForLength(malformLength); 650 break; 651 } 652 /* fall through */ 653 case GB2312_1: 654 /* fall through */ 655 case CNS_11643_1: 656 myConverterData.toU2022State.cs[1] = tempState; 657 break; 658 case CNS_11643_2: 659 myConverterData.toU2022State.cs[2] = tempState; 660 break; 661 default: 662 /* other CNS 11643 planes */ 663 if (myConverterData.version == 0) { 664 err = CoderResult.unmappableForLength(source.position() - 1); 665 } else { 666 myConverterData.toU2022State.cs[3] = tempState; 667 } 668 break; 669 } //end of switch 670 } 671 break; 672 case ISO_2022_KR: 673 if (offset[0] == 0x30) { 674 /* nothing to be done, just 
accept this one escape sequence */ 675 } else { 676 err = CoderResult.unmappableForLength(malformLength); 677 } 678 break; 679 default: 680 err = CoderResult.malformedForLength(malformLength); 681 break; 682 } // end of switch 683 } 684 if (!err.isError()) { 685 decoder.toULength = 0; 686 } else if (err.isMalformed()) { 687 if (decoder.toULength > 1) { 688 /* 689 * Ticket 5691: consistent illegal sequences: 690 * - We include at least the first byte (ESC) in the illegal sequence. 691 * - If any of the non-initial bytes could be the start of a character, 692 * we stop the illegal sequece before the first one of those. 693 * In escape sequences, all following bytes are "printable", that is, 694 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), 695 * they are valid single/lead bytes. 696 * For simplicity, we always only report the initial ESC byte as the 697 * illegal sequence and back out all other bytes we looked at. 698 */ 699 /* Back out some bytes. */ 700 int backOutDistance = decoder.toULength - 1; 701 int bytesFromThisBuffer = decoder.toULength - initialToULength; 702 if (backOutDistance <= bytesFromThisBuffer) { 703 /* same as initialToULength<=1 */ 704 source.position(source.position() - backOutDistance); 705 } else { 706 /* Back out bytes from the previous buffer: Need to replay them. */ 707 decoder.preToULength = (byte)(bytesFromThisBuffer - backOutDistance); 708 /* same as -(initalToULength-1) */ 709 /* preToULength is negative! 
*/ 710 for (int i = 0; i < -(decoder.preToULength); i++) { 711 decoder.preToUArray[i] = decoder.toUBytesArray[i+1]; 712 } 713 source.position(source.position() - bytesFromThisBuffer); 714 } 715 decoder.toULength = 1; 716 } 717 } 718 719 return err; 720 } 721 722 private static byte getKey_2022(byte c, int[]key, int[]offset) { 723 int togo; 724 int low = 0; 725 int hi = MAX_STATES_2022; 726 int oldmid = 0; 727 728 togo = normalize_esq_chars_2022[c&UConverterConstants.UNSIGNED_BYTE_MASK]; 729 730 if (togo == 0) { 731 /* not a valid character anywhere in an escape sequence */ 732 key[0] = 0; 733 offset[0] = 0; 734 return INVALID_2022; 735 } 736 togo = (key[0] << 5) + togo; 737 738 while (hi != low) { /* binary search */ 739 int mid = (hi+low) >> 1; /* Finds median */ 740 741 if (mid == oldmid) { 742 break; 743 } 744 745 if (escSeqStateTable_Key_2022[mid] > togo) { 746 hi = mid; 747 } else if (escSeqStateTable_Key_2022[mid] < togo) { 748 low = mid; 749 } else /* we found it */ { 750 key[0] = togo; 751 offset[0] = mid; 752 return escSeqStateTable_Value_2022[mid]; 753 } 754 oldmid = mid; 755 } 756 return INVALID_2022; 757 } 758 759 /* 760 * To Unicode Callback helper function 761 */ 762 private static CoderResult toUnicodeCallback(CharsetDecoderICU cnv, int sourceChar, int targetUniChar) { 763 CoderResult err = CoderResult.UNDERFLOW; 764 if (sourceChar > 0xff) { 765 cnv.toUBytesArray[0] = (byte)(sourceChar>>8); 766 cnv.toUBytesArray[1] = (byte)sourceChar; 767 cnv.toULength = 2; 768 } else { 769 cnv.toUBytesArray[0] = (byte)sourceChar; 770 cnv.toULength = 1; 771 } 772 773 if (targetUniChar == (UConverterConstants.missingCharMarker-1/* 0xfffe */)) { 774 err = CoderResult.unmappableForLength(1); 775 } else { 776 err = CoderResult.malformedForLength(1); 777 } 778 779 return err; 780 } 781 782 /****************************ISO-2022-JP************************************/ 783 private class CharsetDecoderISO2022JP extends CharsetDecoderICU { 784 public 
CharsetDecoderISO2022JP(CharsetICU cs) { 785 super(cs); 786 } 787 788 @Override 789 protected void implReset() { 790 super.implReset(); 791 myConverterData.reset(); 792 } 793 /* 794 * Map 00..7F to Unicode according to JIS X 0201. 795 * */ 796 private int jisx201ToU(int value) { 797 if (value < 0x5c) { 798 return value; 799 } else if (value == 0x5c) { 800 return 0xa5; 801 } else if (value == 0x7e) { 802 return 0x203e; 803 } else { /* value <= 0x7f */ 804 return value; 805 } 806 } 807 /* 808 * Convert a pair of JIS X 208 21..7E bytes to Shift-JIS. 809 * If either byte is outside 21..7E make sure that the result is not valid 810 * for Shift-JIS so that the converter catches it. 811 * Some invalid byte values already turn into equally invalid Shift-JIS 812 * byte values and need not be tested explicitly. 813 */ 814 private void _2022ToSJIS(char c1, char c2, byte []bytes) { 815 if ((c1&1) > 0) { 816 ++c1; 817 if (c2 <= 0x5f) { 818 c2 += 0x1f; 819 } else if (c2 <= 0x7e) { 820 c2 += 0x20; 821 } else { 822 c2 = 0; /* invalid */ 823 } 824 } else { 825 if ((c2 >= 0x21) && (c2 <= 0x7e)) { 826 c2 += 0x7e; 827 } else { 828 c2 = 0; /* invalid */ 829 } 830 } 831 832 c1 >>=1; 833 if (c1 <= 0x2f) { 834 c1 += 0x70; 835 } else if (c1 <= 0x3f) { 836 c1 += 0xb0; 837 } else { 838 c1 = 0; /* invalid */ 839 } 840 bytes[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & c1); 841 bytes[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & c2); 842 } 843 844 @Override 845 @SuppressWarnings("fallthrough") 846 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { 847 boolean gotoGetTrail = false; 848 boolean gotoEscape = false; 849 CoderResult err = CoderResult.UNDERFLOW; 850 byte []tempBuf = new byte[2]; 851 int targetUniChar = 0x0000; 852 int mySourceChar = 0x0000; 853 int mySourceCharTemp = 0x0000; // use for getTrail label call. 854 byte cs; /* StateEnum */ 855 byte csTemp= 0; // use for getTrail label call. 
856 857 if (myConverterData.key != 0) { 858 /* continue with a partial escape sequence */ 859 // goto escape; 860 gotoEscape = true; 861 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) { 862 /* continue with a partial double-byte character */ 863 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK); 864 toULength = 0; 865 cs = myConverterData.toU2022State.cs[myConverterData.toU2022State.g]; 866 // goto getTrailByte; 867 mySourceCharTemp = 0x99; 868 gotoGetTrail = true; 869 } 870 871 while (source.hasRemaining() || gotoEscape || gotoGetTrail) { 872 // This code is here for the goto escape label call above. 873 if (gotoEscape) { 874 mySourceCharTemp = ESC_2022; 875 } 876 877 targetUniChar = UConverterConstants.missingCharMarker; 878 879 if (gotoEscape || gotoGetTrail || target.hasRemaining()) { 880 if (!gotoEscape && !gotoGetTrail) { 881 mySourceChar = source.get() & UConverterConstants.UNSIGNED_BYTE_MASK; 882 mySourceCharTemp = mySourceChar; 883 } 884 885 switch (mySourceCharTemp) { 886 case UConverterConstants.SI: 887 if (myConverterData.version == 3) { 888 myConverterData.toU2022State.g = 0; 889 continue; 890 } else { 891 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ 892 myConverterData.isEmptySegment = false; 893 break; 894 } 895 896 case UConverterConstants.SO: 897 if (myConverterData.version == 3) { 898 /* JIS7: switch to G1 half-width Katakana */ 899 myConverterData.toU2022State.cs[1] = HWKANA_7BIT; 900 myConverterData.toU2022State.g = 1; 901 continue; 902 } else { 903 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ 904 myConverterData.isEmptySegment = false; /* reset this, we have a different error */ 905 break; 906 } 907 908 case ESC_2022: 909 if (!gotoEscape) { 910 source.position(source.position() - 1); 911 } else { 912 gotoEscape = false; 913 } 914// escape: 915 { 916 int mySourceBefore = source.position(); 917 int toULengthBefore = this.toULength; 918 919 err = changeState_2022(this, source, variant); 920 
921 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */ 922 if(myConverterData.version == 0 && myConverterData.key == 0 && !err.isError() && myConverterData.isEmptySegment) { 923 err = CoderResult.malformedForLength(source.position() - mySourceBefore); 924 this.toULength = toULengthBefore + (source.position() - mySourceBefore); 925 } 926 } 927 928 /* invalid or illegal escape sequence */ 929 if(err.isError()){ 930 myConverterData.isEmptySegment = false; /* Reset to avoid future spurious errors */ 931 return err; 932 } 933 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ 934 if(myConverterData.key == 0) { 935 myConverterData.isEmptySegment = true; 936 } 937 938 continue; 939 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ 940 case CR: 941 /* falls through */ 942 case LF: 943 /* automatically reset to single-byte mode */ 944 if (myConverterData.toU2022State.cs[0] != ASCII && myConverterData.toU2022State.cs[0] != JISX201) { 945 myConverterData.toU2022State.cs[0] = ASCII; 946 } 947 myConverterData.toU2022State.cs[2] = 0; 948 myConverterData.toU2022State.g = 0; 949 /* falls through */ 950 default : 951 /* convert one or two bytes */ 952 myConverterData.isEmptySegment = false; 953 cs = myConverterData.toU2022State.cs[myConverterData.toU2022State.g]; 954 csTemp = cs; 955 if (gotoGetTrail) { 956 csTemp = (byte)0x99; 957 } 958 if (!gotoGetTrail && ((mySourceChar >= 0xa1) && (mySourceChar <= 0xdf) && myConverterData.version == 4 && !IS_JP_DBCS(cs))) { 959 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ 960 targetUniChar = mySourceChar + (HWKANA_START - 0xa1); 961 962 /* return from a single-shift state to the previous one */ 963 if (myConverterData.toU2022State.g >= 2) { 964 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG; 965 } 966 } else { 967 switch(csTemp) { 968 case ASCII: 969 if (mySourceChar <= 0x7f) { 970 
targetUniChar = mySourceChar; 971 } 972 break; 973 case ISO8859_1: 974 if (mySourceChar <= 0x7f) { 975 targetUniChar = mySourceChar + 0x80; 976 } 977 /* return from a single-shift state to the prevous one */ 978 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG; 979 break; 980 case ISO8859_7: 981 if (mySourceChar <= 0x7f) { 982 /* convert mySourceChar+0x80 to use a normal 8-bit table */ 983 targetUniChar = CharsetMBCS.MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myConverterData.myConverterArray[cs].mbcs, 984 mySourceChar+0x80); 985 } 986 /* return from a single-shift state to the previous one */ 987 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG; 988 break; 989 case JISX201: 990 if (mySourceChar <= 0x7f) { 991 targetUniChar = jisx201ToU(mySourceChar); 992 } 993 break; 994 case HWKANA_7BIT: 995 if ((mySourceChar >= 0x21) && (mySourceChar <= 0x5f)) { 996 /* 7-bit halfwidth Katakana */ 997 targetUniChar = mySourceChar + (HWKANA_START - 0x21); 998 break; 999 } 1000 default : 1001 /* G0 DBCS */ 1002 if (gotoGetTrail || source.hasRemaining()) { 1003// getTrailByte: 1004 int tmpSourceChar; 1005 gotoGetTrail = false; 1006 short trailByte; 1007 boolean leadIsOk, trailIsOk; 1008 1009 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK); 1010 /* 1011 * Ticket 5691: consistent illegal sequences: 1012 * - We include at least the first byte in the illegal sequence. 1013 * - If any of the non-initial bytes could be the start of a character, 1014 * we stop the illegal sequence before the first one of those. 1015 * 1016 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 1017 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 1018 * Otherwise we convert or report the pair of bytes. 
1019 */ 1020 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21); 1021 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21); 1022 if (leadIsOk && trailIsOk) { 1023 source.get(); 1024 tmpSourceChar = (mySourceChar << 8) | trailByte; 1025 if (cs == JISX208) { 1026 _2022ToSJIS((char)mySourceChar, (char)trailByte, tempBuf); 1027 mySourceChar = tmpSourceChar; 1028 } else { 1029 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ 1030 mySourceChar = tmpSourceChar; 1031 if (cs == KSC5601) { 1032 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ 1033 } 1034 tempBuf[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (tmpSourceChar >> 8)); 1035 tempBuf[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & tmpSourceChar); 1036 } 1037 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.myConverterArray[cs], ByteBuffer.wrap(tempBuf), false); 1038 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 1039 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 1040 source.get(); 1041 /* add another bit so that the code below writes 2 bytes in case of error */ 1042 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 1043 } 1044 } else { 1045 toUBytesArray[0] = (byte)mySourceChar; 1046 toULength = 1; 1047 // goto endloop 1048 return err; 1049 } 1050 } /* end of inner switch */ 1051 } 1052 break; 1053 } /* end of outer switch */ 1054 1055 if (targetUniChar < (UConverterConstants.missingCharMarker-1/*0xfffe*/)) { 1056 if (offsets != null) { 1057 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 
1 : 2)); 1058 } 1059 target.put((char)targetUniChar); 1060 } else if (targetUniChar > UConverterConstants.missingCharMarker) { 1061 /* disassemble the surrogate pair and write to output */ 1062 targetUniChar -= 0x0010000; 1063 target.put((char)(0xd800 + (char)(targetUniChar>>10))); 1064 target.position(target.position()-1); 1065 if (offsets != null) { 1066 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2)); 1067 } 1068 target.get(); 1069 if (target.hasRemaining()) { 1070 target.put((char)(0xdc00+(char)(targetUniChar&0x3ff))); 1071 target.position(target.position()-1); 1072 if (offsets != null) { 1073 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2)); 1074 } 1075 target.get(); 1076 } else { 1077 charErrorBufferArray[charErrorBufferLength++] = 1078 (char)(0xdc00+(char)(targetUniChar&0x3ff)); 1079 } 1080 } else { 1081 /* Call the callback function */ 1082 err = toUnicodeCallback(this, mySourceChar, targetUniChar); 1083 break; 1084 } 1085 } else { /* goes with "if (target.hasRemaining())" way up near the top of the function */ 1086 err = CoderResult.OVERFLOW; 1087 break; 1088 } 1089 } 1090//endloop: 1091 return err; 1092 } 1093 } // end of class CharsetDecoderISO2022JP 1094 1095 /****************************ISO-2022-CN************************************/ 1096 private class CharsetDecoderISO2022CN extends CharsetDecoderICU { 1097 public CharsetDecoderISO2022CN(CharsetICU cs) { 1098 super(cs); 1099 } 1100 1101 @Override 1102 protected void implReset() { 1103 super.implReset(); 1104 myConverterData.reset(); 1105 } 1106 1107 @Override 1108 @SuppressWarnings("fallthrough") 1109 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { 1110 CoderResult err = CoderResult.UNDERFLOW; 1111 byte[] tempBuf = new byte[3]; 1112 int targetUniChar = 0x0000; 1113 int mySourceChar = 0x0000; 1114 int mySourceCharTemp = 0x0000; 1115 boolean gotoEscape = false; 1116 
boolean gotoGetTrailByte = false; 1117 1118 if (myConverterData.key != 0) { 1119 /* continue with a partial escape sequence */ 1120 // goto escape; 1121 gotoEscape = true; 1122 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) { 1123 /* continue with a partial double-byte character */ 1124 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK); 1125 toULength = 0; 1126 targetUniChar = UConverterConstants.missingCharMarker; 1127 // goto getTrailByte 1128 gotoGetTrailByte = true; 1129 } 1130 1131 while (source.hasRemaining() || gotoGetTrailByte || gotoEscape) { 1132 targetUniChar = UConverterConstants.missingCharMarker; 1133 1134 if (target.hasRemaining() || gotoEscape) { 1135 if (gotoEscape) { 1136 mySourceChar = ESC_2022; // goto escape label 1137 mySourceCharTemp = mySourceChar; 1138 } else if (gotoGetTrailByte) { 1139 mySourceCharTemp = 0xff; // goto getTrailByte; set mySourceCharTemp to go to default 1140 } else { 1141 mySourceChar = UConverterConstants.UNSIGNED_BYTE_MASK & source.get(); 1142 mySourceCharTemp = mySourceChar; 1143 } 1144 1145 switch (mySourceCharTemp) { 1146 case UConverterConstants.SI: 1147 myConverterData.toU2022State.g = 0; 1148 if (myConverterData.isEmptySegment) { 1149 myConverterData.isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */ 1150 err = CoderResult.malformedForLength(1); 1151 this.toUBytesArray[0] = (byte)mySourceChar; 1152 this.toULength = 1; 1153 return err; 1154 } 1155 continue; 1156 1157 case UConverterConstants.SO: 1158 if (myConverterData.toU2022State.cs[1] != 0) { 1159 myConverterData.toU2022State.g = 1; 1160 myConverterData.isEmptySegment = true; /* Begin a new segment, empty so far */ 1161 continue; 1162 } else { 1163 /* illegal to have SO before a matching designator */ 1164 myConverterData.isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */ 1165 break; 1166 } 1167 1168 case ESC_2022: 1169 if 
(!gotoEscape) { 1170 source.position(source.position()-1); 1171 } 1172// escape label 1173 gotoEscape = false; 1174 { 1175 int mySourceBefore = source.position(); 1176 int toULengthBefore = this.toULength; 1177 1178 err = changeState_2022(this, source, ISO_2022_CN); 1179 1180 /* After SO there must be at least one character before a designator (designator error handled separately) */ 1181 if(myConverterData.key == 0 && !err.isError() && myConverterData.isEmptySegment) { 1182 err = CoderResult.malformedForLength(source.position() - mySourceBefore); 1183 this.toULength = toULengthBefore + (source.position() - mySourceBefore); 1184 } 1185 } 1186 1187 /* invalid or illegal escape sequence */ 1188 if(err.isError()){ 1189 myConverterData.isEmptySegment = false; /* Reset to avoid future spurious errors */ 1190 return err; 1191 } 1192 continue; 1193 1194 /*ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ 1195 case CR: 1196 /* falls through */ 1197 case LF: 1198 myConverterData.toU2022State.reset(); 1199 /* falls through */ 1200 default: 1201 /* converter one or two bytes */ 1202 myConverterData.isEmptySegment = false; 1203 if (myConverterData.toU2022State.g != 0 || gotoGetTrailByte) { 1204 if (source.hasRemaining() || gotoGetTrailByte) { 1205 UConverterSharedData cnv; 1206 byte tempState; 1207 int tempBufLen; 1208 boolean leadIsOk, trailIsOk; 1209 short trailByte; 1210// getTrailByte: label 1211 gotoGetTrailByte = false; // reset gotoGetTrailByte 1212 1213 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK); 1214 /* 1215 * Ticket 5691: consistent illegal sequences: 1216 * - We include at least the first byte in the illegal sequence. 1217 * - If any of the non-initial bytes could be the start of a character, 1218 * we stop the illegal sequence before the first one of those. 
1219 * 1220 * In ISO-2022 DBCS, if the second byte is in the range 21..7e range or is 1221 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 1222 * Otherwise we convert or report the pair of bytes. 1223 */ 1224 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21); 1225 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21); 1226 if (leadIsOk && trailIsOk) { 1227 source.get(); 1228 tempState = myConverterData.toU2022State.cs[myConverterData.toU2022State.g]; 1229 if (tempState > CNS_11643_0) { 1230 cnv = myConverterData.myConverterArray[CNS_11643]; 1231 tempBuf[0] = (byte)(0x80 + (tempState - CNS_11643_0)); 1232 tempBuf[1] = (byte)mySourceChar; 1233 tempBuf[2] = (byte)trailByte; 1234 tempBufLen = 3; 1235 } else { 1236 cnv = myConverterData.myConverterArray[tempState]; 1237 tempBuf[0] = (byte)mySourceChar; 1238 tempBuf[1] = (byte)trailByte; 1239 tempBufLen = 2; 1240 } 1241 ByteBuffer tempBuffer = ByteBuffer.wrap(tempBuf); 1242 tempBuffer.limit(tempBufLen); 1243 targetUniChar = MBCSSimpleGetNextUChar(cnv, tempBuffer, false); 1244 mySourceChar = (mySourceChar << 8) | trailByte; 1245 1246 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 1247 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 1248 source.get(); 1249 /* add another bit so that the code below writes 2 bytes in case of error */ 1250 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 1251 } 1252 if (myConverterData.toU2022State.g >= 2) { 1253 /* return from a single-shift state to the previous one */ 1254 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG; 1255 } 1256 } else { 1257 toUBytesArray[0] = (byte)mySourceChar; 1258 toULength = 1; 1259 // goto endloop; 1260 return err; 1261 } 1262 } else { 1263 if (mySourceChar <= 0x7f) { 1264 targetUniChar = (char)mySourceChar; 1265 } 1266 } 1267 break; 1268 } 1269 if 
((UConverterConstants.UNSIGNED_INT_MASK&targetUniChar) < (UConverterConstants.UNSIGNED_INT_MASK&(UConverterConstants.missingCharMarker-1))) { 1270 if (offsets != null) { 1271 offsets.array()[target.position()] = source.remaining() - (mySourceChar <= 0xff ? 1 : 2); 1272 } 1273 target.put((char)targetUniChar); 1274 } else if ((UConverterConstants.UNSIGNED_INT_MASK&targetUniChar) > (UConverterConstants.UNSIGNED_INT_MASK&(UConverterConstants.missingCharMarker))) { 1275 /* disassemble the surrogate pair and write to output */ 1276 targetUniChar -= 0x0010000; 1277 target.put((char)(0xd800+(char)(targetUniChar>>10))); 1278 if (offsets != null) { 1279 offsets.array()[target.position()-1] = source.position() - (mySourceChar <= 0xff ? 1 : 2); 1280 } 1281 if (target.hasRemaining()) { 1282 target.put((char)(0xdc00+(char)(targetUniChar&0x3ff))); 1283 if (offsets != null) { 1284 offsets.array()[target.position()-1] = source.position() - (mySourceChar <= 0xff ? 1 : 2); 1285 } 1286 } else { 1287 charErrorBufferArray[charErrorBufferLength++] = (char)(0xdc00+(char)(targetUniChar&0x3ff)); 1288 } 1289 } else { 1290 /* Call the callback function */ 1291 err = toUnicodeCallback(this, mySourceChar, targetUniChar); 1292 break; 1293 } 1294 1295 } else { 1296 err = CoderResult.OVERFLOW; 1297 break; 1298 } 1299 } 1300 1301 return err; 1302 } 1303 1304 } 1305 /************************ ISO-2022-KR ********************/ 1306 private class CharsetDecoderISO2022KR extends CharsetDecoderICU { 1307 public CharsetDecoderISO2022KR(CharsetICU cs) { 1308 super(cs); 1309 } 1310 1311 @Override 1312 protected void implReset() { 1313 super.implReset(); 1314 setInitialStateToUnicodeKR(); 1315 myConverterData.reset(); 1316 } 1317 1318 @Override 1319 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { 1320 CoderResult err = CoderResult.UNDERFLOW; 1321 int mySourceChar = 0x0000; 1322 int targetUniChar = 0x0000; 1323 byte[] tempBuf = new byte[2]; 1324 
boolean usingFallback; 1325 boolean gotoGetTrailByte = false; 1326 boolean gotoEscape = false; 1327 1328 if (myConverterData.version == 1) { 1329 return decodeLoopIBM(myConverterData.currentDecoder, source, target, offsets, flush); 1330 } 1331 1332 /* initialize state */ 1333 usingFallback = isFallbackUsed(); 1334 1335 if (myConverterData.key != 0) { 1336 /* continue with a partial escape sequence */ 1337 gotoEscape = true; 1338 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) { 1339 /* continue with a partial double-byte character */ 1340 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK); 1341 toULength = 0; 1342 gotoGetTrailByte = true; 1343 } 1344 1345 while (source.hasRemaining() || gotoGetTrailByte || gotoEscape) { 1346 if (target.hasRemaining() || gotoGetTrailByte || gotoEscape) { 1347 if (!gotoGetTrailByte && !gotoEscape) { 1348 mySourceChar = (char)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK); 1349 } 1350 1351 if (!gotoGetTrailByte && !gotoEscape && mySourceChar == UConverterConstants.SI) { 1352 myConverterData.toU2022State.g = 0; 1353 if (myConverterData.isEmptySegment) { 1354 myConverterData.isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */ 1355 err = CoderResult.malformedForLength(1); 1356 this.toUBytesArray[0] = (byte)mySourceChar; 1357 this.toULength = 1; 1358 return err; 1359 } 1360 /* consume the source */ 1361 continue; 1362 } else if (!gotoGetTrailByte && !gotoEscape && mySourceChar == UConverterConstants.SO) { 1363 myConverterData.toU2022State.g = 1; 1364 myConverterData.isEmptySegment = true; 1365 /* consume the source */ 1366 continue; 1367 } else if (!gotoGetTrailByte && (gotoEscape || mySourceChar == ESC_2022)) { 1368 if (!gotoEscape) { 1369 source.position(source.position()-1); 1370 } 1371// escape label 1372 gotoEscape = false; // reset gotoEscape flag 1373 myConverterData.isEmptySegment = false; /* Any invalid ESC sequences will be detected 
separately, so just reset this */ 1374 err = changeState_2022(this, source, ISO_2022_KR); 1375 if (err.isError()) { 1376 return err; 1377 } 1378 continue; 1379 } 1380 myConverterData.isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */ 1381 if (myConverterData.toU2022State.g == 1 || gotoGetTrailByte) { 1382 if (source.hasRemaining() || gotoGetTrailByte) { 1383 boolean leadIsOk, trailIsOk; 1384 short trailByte; 1385// getTrailByte label 1386 gotoGetTrailByte = false; // reset gotoGetTrailByte flag 1387 1388 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK); 1389 targetUniChar = UConverterConstants.missingCharMarker; 1390 /* 1391 * Ticket 5691: consistent illegal sequences: 1392 * - We include at least the first byte in the illegal sequence. 1393 * - If any of the non-initial bytes could be the start of a character, 1394 * we stop the illegal sequence before the first one of those. 1395 * 1396 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 1397 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 1398 * Otherwise we convert or report the pair of bytes. 
1399 */ 1400 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21); 1401 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21); 1402 if (leadIsOk && trailIsOk) { 1403 source.get(); 1404 tempBuf[0] = (byte)(mySourceChar + 0x80); 1405 tempBuf[1] = (byte)(trailByte + 0x80); 1406 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, ByteBuffer.wrap(tempBuf), usingFallback); 1407 mySourceChar = (char)((mySourceChar << 8) | trailByte); 1408 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 1409 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 1410 source.get(); 1411 /* add another bit so that the code below writes 2 bytes in case of error */ 1412 mySourceChar = (char)(0x10000 | (mySourceChar << 8) | trailByte); 1413 } 1414 } else { 1415 toUBytesArray[0] = (byte)mySourceChar; 1416 toULength = 1; 1417 break; 1418 } 1419 } else if (mySourceChar <= 0x7f) { 1420 int savedSourceLimit = source.limit(); 1421 int savedSourcePosition = source.position(); 1422 source.limit(source.position()); 1423 source.position(source.position()-1); 1424 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, source, usingFallback); 1425 source.limit(savedSourceLimit); 1426 source.position(savedSourcePosition); 1427 } else { 1428 targetUniChar = 0xffff; 1429 } 1430 if (targetUniChar < 0xfffe) { 1431 target.put((char)targetUniChar); 1432 if (offsets != null) { 1433 offsets.array()[target.position()] = source.position() - (mySourceChar <= 0xff ? 
1 : 2); 1434 } 1435 } else { 1436 /* Call the callback function */ 1437 err = toUnicodeCallback(this, mySourceChar, targetUniChar); 1438 break; 1439 } 1440 } else { 1441 err = CoderResult.OVERFLOW; 1442 break; 1443 } 1444 } 1445 1446 return err; 1447 } 1448 1449 protected CoderResult decodeLoopIBM(CharsetDecoderMBCS cnv, ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { 1450 CoderResult err = CoderResult.UNDERFLOW; 1451 int sourceStart; 1452 int sourceLimit; 1453 int argSource; 1454 int argTarget; 1455 boolean gotoEscape = false; 1456 int oldSourceLimit; 1457 1458 /* remember the original start of the input for offsets */ 1459 sourceStart = argSource = source.position(); 1460 1461 if (myConverterData.key != 0) { 1462 /* continue with a partial escape sequence */ 1463 gotoEscape = true; 1464 } 1465 1466 while (gotoEscape || (!err.isError() && source.hasRemaining())) { 1467 if (!gotoEscape) { 1468 /* Find the end of the buffer e.g : Next Escape Seq | end of Buffer */ 1469 int oldSourcePos = source.position(); 1470 sourceLimit = getEndOfBuffer_2022(source); 1471 source.position(oldSourcePos); 1472 if (source.position() != sourceLimit) { 1473 /* 1474 * get the current partial byte sequence 1475 * 1476 * it needs to be moved between the public and the subconverter 1477 * so that the conversion frameword, which only sees the public 1478 * converter, can handle truncated and illegal input etc. 1479 */ 1480 if (toULength > 0) { 1481 cnv.toUBytesArray = toUBytesArray.clone(); 1482 } 1483 cnv.toULength = toULength; 1484 1485 /* 1486 * Convert up to the end of the input, or to before the next escape character. 1487 * Does not handle conversion extensions because the preToU[] state etc. 1488 * is not copied. 
1489 */ 1490 argTarget = target.position(); 1491 oldSourceLimit = source.limit(); // save the old source limit change to new one 1492 source.limit(sourceLimit); 1493 err = myConverterData.currentDecoder.cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush); 1494 source.limit(oldSourceLimit); // restore source limit; 1495 if (offsets != null && sourceStart != argSource) { 1496 /* update offsets to base them on the actual start of the input */ 1497 int delta = argSource - sourceStart; 1498 while (argTarget < target.position()) { 1499 int currentOffset = offsets.get(); 1500 offsets.position(offsets.position()-1); 1501 if (currentOffset >= 0) { 1502 offsets.put(currentOffset + delta); 1503 offsets.position(offsets.position()-1); 1504 } 1505 offsets.get(); 1506 target.get(); 1507 } 1508 } 1509 argSource = source.position(); 1510 1511 /* copy input/error/overflow buffers */ 1512 if (cnv.toULength > 0) { 1513 toUBytesArray = cnv.toUBytesArray.clone(); 1514 } 1515 toULength = cnv.toULength; 1516 1517 if (err.isOverflow()) { 1518 if (cnv.charErrorBufferLength > 0) { 1519 charErrorBufferArray = cnv.charErrorBufferArray.clone(); 1520 } 1521 charErrorBufferLength = cnv.charErrorBufferLength; 1522 cnv.charErrorBufferLength = 0; 1523 } 1524 } 1525 1526 if (err.isError() || err.isOverflow() || (source.position() == source.limit())) { 1527 return err; 1528 } 1529 } 1530// escape label 1531 gotoEscape = false; 1532 err = changeState_2022(this, source, ISO_2022_KR); 1533 } 1534 return err; 1535 } 1536 } 1537 1538 /******************** from unicode **********************/ 1539 /* preference order of JP charsets */ 1540 private final static byte []jpCharsetPref = { 1541 ASCII, 1542 JISX201, 1543 ISO8859_1, 1544 JISX208, 1545 ISO8859_7, 1546 JISX212, 1547 GB2312, 1548 KSC5601, 1549 HWKANA_7BIT 1550 }; 1551 /* 1552 * The escape sequences must be in order of the enum constants like JISX201 = 3, 1553 * not in order of jpCharsetPref[]! 
*/
    private static final byte[][] escSeqChars = {
        { 0x1B, 0x28, 0x42 },           /* <ESC>(B   ASCII       */
        { 0x1B, 0x2E, 0x41 },           /* <ESC>.A   ISO-8859-1  */
        { 0x1B, 0x2E, 0x46 },           /* <ESC>.F   ISO-8859-7  */
        { 0x1B, 0x28, 0x4A },           /* <ESC>(J   JISX-201    */
        { 0x1B, 0x24, 0x42 },           /* <ESC>$B   JISX-208    */
        { 0x1B, 0x24, 0x28, 0x44 },     /* <ESC>$(D  JISX-212    */
        { 0x1B, 0x24, 0x41 },           /* <ESC>$A   GB2312      */
        { 0x1B, 0x24, 0x28, 0x43 },     /* <ESC>$(C  KSC5601     */
        { 0x1B, 0x28, 0x49 }            /* <ESC>(I   HWKANA_7BIT */
    };

    /*
     * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
     * Katakana.
     * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
     * because Shift-JIS roundtrips half-width Katakana to single bytes.
     * These were the only fallbacks in ICU's jisx-208.ucm file.
     *
     * The table has one entry per code point from U+FF61 to U+FF9F, in order.
     */
    private static final char[] hwkana_fb = {
        /* U+FF61..U+FF6F */
        0x2123, 0x2156, 0x2157, 0x2122, 0x2126, 0x2572, 0x2521, 0x2523,
        0x2525, 0x2527, 0x2529, 0x2563, 0x2565, 0x2567, 0x2543,
        /* U+FF70..U+FF7F */
        0x213C, 0x2522, 0x2524, 0x2526, 0x2528, 0x252A, 0x252B, 0x252D,
        0x252F, 0x2531, 0x2533, 0x2535, 0x2537, 0x2539, 0x253B, 0x253D,
        /* U+FF80..U+FF8F */
        0x253F, 0x2541, 0x2544, 0x2546, 0x2548, 0x254A, 0x254B, 0x254C,
        0x254D, 0x254E, 0x254F, 0x2552, 0x2555, 0x2558, 0x255B, 0x255E,
        /* U+FF90..U+FF9F */
        0x255F, 0x2560, 0x2561, 0x2562, 0x2564, 0x2566, 0x2568, 0x2569,
        0x256A, 0x256B, 0x256C, 0x256D, 0x256F, 0x2573, 0x212B, 0x212C
    };

    /* Substitution byte sequences: a single byte 0x1A and a byte pair 0x2F 0x7E. */
    protected byte[][] fromUSubstitutionChar = new byte[][] {
        { (byte) 0x1A },
        { (byte) 0x2F, (byte) 0x7E }
    };
/****************************ISO-2022-JP************************************/ 1641 private class CharsetEncoderISO2022JP extends CharsetEncoderICU { 1642 public CharsetEncoderISO2022JP(CharsetICU cs) { 1643 super(cs, fromUSubstitutionChar[0]); 1644 } 1645 1646 @Override 1647 protected void implReset() { 1648 super.implReset(); 1649 myConverterData.reset(); 1650 } 1651 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ 1652 private int jisx201FromU(int value) { 1653 if (value <= 0x7f) { 1654 if (value != 0x5c && value != 0x7e) { 1655 return value; 1656 } 1657 } else if (value == 0xa5) { 1658 return 0x5c; 1659 } else if (value == 0x203e) { 1660 return 0x7e; 1661 } 1662 return (int)(UConverterConstants.UNSIGNED_INT_MASK & 0xfffe); 1663 } 1664 1665 /* 1666 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding 1667 * to JIS X 0208, and convert it to a pair of 21..7E bytes. 1668 * Return 0 if the byte pair is out of range. 1669 */ 1670 private int _2022FromSJIS(int value) { 1671 short trail; 1672 1673 if (value > 0xEFFC) { 1674 return 0; /* beyond JIS X 0208 */ 1675 } 1676 1677 trail = (short)(value & UConverterConstants.UNSIGNED_BYTE_MASK); 1678 1679 value &= 0xff00; /* lead byte */ 1680 if (value <= 0x9f00) { 1681 value -= 0x7000; 1682 } else { /* 0xe000 <= value <= 0xef00 */ 1683 value -= 0xb000; 1684 } 1685 1686 value <<= 1; 1687 1688 if (trail <= 0x9e) { 1689 value -= 0x100; 1690 if (trail <= 0x7e) { 1691 value |= ((trail - 0x1f) & UConverterConstants.UNSIGNED_BYTE_MASK); 1692 } else { 1693 value |= ((trail - 0x20) & UConverterConstants.UNSIGNED_BYTE_MASK); 1694 } 1695 } else { /* trail <= 0xfc */ 1696 value |= ((trail - 0x7e) & UConverterConstants.UNSIGNED_BYTE_MASK); 1697 } 1698 1699 return value; 1700 } 1701 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */ 1702 @Override 1703 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder, 1704 CharBuffer source, ByteBuffer target, IntBuffer 
offsets){ 1705 CoderResult err = CoderResult.UNDERFLOW; 1706 byte[] buffer = new byte[8]; 1707 int i = 0; 1708 byte[] subchar; 1709 subchar = encoder.replacement(); 1710 1711 byte cs; 1712 if (myConverterData.fromU2022State.g == 1) { 1713 /* JIS7: switch from G1 to G0 */ 1714 myConverterData.fromU2022State.g = 0; 1715 buffer[i++] = UConverterConstants.SI; 1716 } 1717 cs = myConverterData.fromU2022State.cs[0]; 1718 1719 if (cs != ASCII && cs != JISX201) { 1720 /* not in ASCII or JIS X 0201: switch to ASCII */ 1721 myConverterData.fromU2022State.cs[0] = ASCII; 1722 buffer[i++] = 0x1B; 1723 buffer[i++] = 0x28; 1724 buffer[i++] = 0x42; 1725 } 1726 1727 buffer[i++] = subchar[0]; 1728 1729 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1); 1730 1731 return err; 1732 } 1733 1734 @Override 1735 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 1736 CoderResult err = CoderResult.UNDERFLOW; 1737 int sourceChar; 1738 byte cs, g; 1739 int choiceCount; 1740 int len, outLen; 1741 byte[] choices = new byte[10]; 1742 int targetValue = 0; 1743 boolean usingFallback; 1744 byte[] buffer = new byte[8]; 1745 boolean getTrail = false; // use for getTrail label 1746 int oldSourcePos; // for proper error handling 1747 1748 choiceCount = 0; 1749 1750 /* check if the last codepoint of previous buffer was a lead surrogate */ 1751 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) { 1752 getTrail = true; 1753 } 1754 1755 while (getTrail || source.hasRemaining()) { 1756 if (getTrail || target.hasRemaining()) { 1757 oldSourcePos = source.position(); 1758 if (!getTrail) { /* skip if going to getTrail label */ 1759 sourceChar = source.get(); 1760 } 1761 /* check if the char is a First surrogate */ 1762 if (getTrail || UTF16.isSurrogate((char)sourceChar)) { 1763 if (getTrail || UTF16.isLeadSurrogate((char)sourceChar)) { 1764// getTrail: 1765 if (getTrail) { 1766 getTrail = false; 
1767 } 1768 /* look ahead to find the trail surrogate */ 1769 if (source.hasRemaining()) { 1770 /* test the following code unit */ 1771 char trail = source.get(); 1772 /* go back to the previous position */ 1773 source.position(source.position()-1); 1774 if (UTF16.isTrailSurrogate(trail)) { 1775 source.get(); 1776 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail); 1777 fromUChar32 = 0x00; 1778 /* convert this supplementary code point */ 1779 /* exit this condition tree */ 1780 } else { 1781 /* this is an unmatched lead code unit (1st surrogate) */ 1782 /* callback(illegal) */ 1783 err = CoderResult.malformedForLength(1); 1784 fromUChar32 = sourceChar; 1785 break; 1786 } 1787 } else { 1788 /* no more input */ 1789 fromUChar32 = sourceChar; 1790 break; 1791 } 1792 } else { 1793 /* this is an unmatched trail code unit (2nd surrogate) */ 1794 /* callback(illegal) */ 1795 err = CoderResult.malformedForLength(1); 1796 fromUChar32 = sourceChar; 1797 break; 1798 } 1799 } 1800 1801 /* do not convert SO/SI/ESC */ 1802 if (IS_2022_CONTROL(sourceChar)) { 1803 /* callback(illegal) */ 1804 err = CoderResult.malformedForLength(1); 1805 fromUChar32 = sourceChar; 1806 break; 1807 } 1808 1809 /* do the conversion */ 1810 1811 if (choiceCount == 0) { 1812 char csm; 1813 /* 1814 * The csm variable keeps track of which charsets are allowed 1815 * and not used yet while building the choices[]. 1816 */ 1817 csm = (char)jpCharsetMasks[myConverterData.version]; 1818 choiceCount = 0; 1819 1820 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1821 if (myConverterData.version == 3 || myConverterData.version == 4) { 1822 choices[choiceCount++] = HWKANA_7BIT; 1823 } 1824 /* Do not try single-bit half-width Katakana for other versions. 
*/ 1825 csm &= ~CSM(HWKANA_7BIT); 1826 1827 /* try the current G0 charset */ 1828 choices[choiceCount++] = cs = myConverterData.fromU2022State.cs[0]; 1829 csm &= ~CSM(cs); 1830 1831 /* try the current G2 charset */ 1832 if ((cs = myConverterData.fromU2022State.cs[2]) != 0) { 1833 choices[choiceCount++] = cs; 1834 csm &= ~CSM(cs); 1835 } 1836 1837 /* try all the other charsets */ 1838 for (int i = 0; i < jpCharsetPref.length; i++) { 1839 cs = jpCharsetPref[i]; 1840 if ((CSM(cs) & csm) != 0) { 1841 choices[choiceCount++] = cs; 1842 csm &= ~CSM(cs); 1843 } 1844 } 1845 } 1846 1847 cs = g = 0; 1848 /* 1849 * len==0: no mapping found yet 1850 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1851 * len>0: found a roundtrip result, done 1852 */ 1853 len = 0; 1854 /* 1855 * We will turn off usingFallBack after finding a fallback, 1856 * but we still get fallbacks from PUA code points as usual. 1857 * Therefore, we will also need to check that we don't overwrite 1858 * an early fallback with a later one. 1859 */ 1860 usingFallback = useFallback; 1861 1862 for (int i = 0; i < choiceCount && len <= 0; i++) { 1863 int[] value = new int[1]; 1864 int len2; 1865 byte cs0 = choices[i]; 1866 switch (cs0) { 1867 case ASCII: 1868 if (sourceChar <= 0x7f) { 1869 targetValue = sourceChar; 1870 len = 1; 1871 cs = cs0; 1872 g = 0; 1873 } 1874 break; 1875 case ISO8859_1: 1876 if (GR96_START <= sourceChar && sourceChar <= GR96_END) { 1877 targetValue = sourceChar - 0x80; 1878 len = 1; 1879 cs = cs0; 1880 g = 2; 1881 } 1882 break; 1883 case HWKANA_7BIT: 1884 if (sourceChar <= HWKANA_END && sourceChar >= HWKANA_START) { 1885 if (myConverterData.version == 3) { 1886 /* JIS7: use G1 (SO) */ 1887 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1888 targetValue = (int)(UConverterConstants.UNSIGNED_INT_MASK & (sourceChar - (HWKANA_START - 0x21))); 1889 len = 1; 1890 myConverterData.fromU2022State.cs[1] = cs = cs0; /* do not output an escape sequence */ 1891 g = 1; 1892 } else if (myConverterData.version == 4) { 1893 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1894 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1895 targetValue = (int)(UConverterConstants.UNSIGNED_INT_MASK & (sourceChar - (HWKANA_START - 0xa1))); 1896 len = 1; 1897 1898 cs = myConverterData.fromU2022State.cs[0]; 1899 if (IS_JP_DBCS(cs)) { 1900 /* switch from a DBCS charset to JISX201 */ 1901 cs = JISX201; 1902 } 1903 /* else stay in the current G0 charset */ 1904 g = 0; 1905 } 1906 /* else do not use HWKANA_7BIT with other versions */ 1907 } 1908 break; 1909 case JISX201: 1910 /* G0 SBCS */ 1911 value[0] = jisx201FromU(sourceChar); 1912 if (value[0] <= 0x7f) { 1913 targetValue = value[0]; 1914 len = 1; 1915 cs = cs0; 1916 g = 0; 1917 usingFallback = false; 1918 } 1919 break; 1920 case JISX208: 1921 /* G0 DBCS from JIS table */ 1922 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0]; 1923 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2; 1924 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback); 1925 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2); 1926 if (len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len) == 2 */ 1927 value[0] = _2022FromSJIS(value[0]); 1928 if (value[0] != 0) { 1929 targetValue = value[0]; 1930 len = len2; 1931 cs = cs0; 1932 g = 0; 1933 usingFallback = false; 1934 } 1935 } else if (len == 0 && usingFallback && sourceChar <= HWKANA_END && sourceChar >= HWKANA_START) { 1936 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1937 len = -2; 1938 cs = cs0; 1939 g = 0; 1940 
usingFallback = false; 1941 } 1942 break; 1943 case ISO8859_7: 1944 /* G0 SBCS forced to 7-bit output */ 1945 len2 = MBCSSingleFromUChar32(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback); 1946 if (len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value[0] && value[0] <= GR96_END) { 1947 targetValue = value[0] - 0x80; 1948 len = len2; 1949 cs = cs0; 1950 g = 2; 1951 usingFallback = false; 1952 } 1953 break; 1954 default : 1955 /* G0 DBCS */ 1956 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0]; 1957 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2; 1958 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback); 1959 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2); 1960 if (len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1961 if (cs0 == KSC5601) { 1962 /* 1963 * Check for valid bytes for the encoding scheme. 1964 * This is necessary because the sub-converter (windows-949) 1965 * has a broader encoding scheme than is valid for 2022. 
1966 */ 1967 value[0] = _2022FromGR94DBCS(value[0]); 1968 if (value[0] == 0) { 1969 break; 1970 } 1971 } 1972 targetValue = value[0]; 1973 len = len2; 1974 cs = cs0; 1975 g = 0; 1976 usingFallback = false; 1977 } 1978 break; 1979 } 1980 } 1981 1982 if (len != 0) { 1983 if (len < 0) { 1984 len = -len; /* fallback */ 1985 } 1986 outLen = 0; 1987 1988 /* write SI if necessary (only for JIS7 */ 1989 if (myConverterData.fromU2022State.g == 1 && g == 0) { 1990 buffer[outLen++] = UConverterConstants.SI; 1991 myConverterData.fromU2022State.g = 0; 1992 } 1993 1994 /* write the designation sequence if necessary */ 1995 if (cs != myConverterData.fromU2022State.cs[g]) { 1996 for (int i = 0; i < escSeqChars[cs].length; i++) { 1997 buffer[outLen++] = escSeqChars[cs][i]; 1998 } 1999 myConverterData.fromU2022State.cs[g] = cs; 2000 2001 /* invalidate the choices[] */ 2002 choiceCount = 0; 2003 } 2004 2005 /* write the shift sequence if necessary */ 2006 if (g != myConverterData.fromU2022State.g) { 2007 switch (g) { 2008 /* case 0 handled before writing escapes */ 2009 case 1: 2010 buffer[outLen++] = UConverterConstants.SO; 2011 myConverterData.fromU2022State.g = 1; 2012 break; 2013 default : /* case 2 */ 2014 buffer[outLen++] = 0x1b; 2015 buffer[outLen++] = 0x4e; 2016 break; 2017 /* case 3: no SS3 in ISO-2022-JP-x */ 2018 } 2019 } 2020 2021 /* write the output bytes */ 2022 if (len == 1) { 2023 buffer[outLen++] = (byte)targetValue; 2024 } else { /* len == 2 */ 2025 buffer[outLen++] = (byte)(targetValue >> 8); 2026 buffer[outLen++] = (byte)targetValue; 2027 } 2028 }else { 2029 /* 2030 * if we cannot find the character after checking all codepages 2031 * then this is an error. 
2032 */ 2033 err = CoderResult.unmappableForLength(source.position()-oldSourcePos); 2034 fromUChar32 = sourceChar; 2035 break; 2036 } 2037 2038 if (sourceChar == CR || sourceChar == LF) { 2039 /* reset the G2 state at the end of a line (conversion got use into ASCII or JISX201 already) */ 2040 myConverterData.fromU2022State.cs[2] = 0; 2041 choiceCount = 0; 2042 } 2043 2044 /* output outLen>0 bytes in buffer[] */ 2045 if (outLen == 1) { 2046 target.put(buffer[0]); 2047 if (offsets != null) { 2048 offsets.put(source.remaining() - 1); /* -1 known to be ASCII */ 2049 } 2050 } else if (outLen == 2 && (target.position() + 2) <= target.limit()) { 2051 target.put(buffer[0]); 2052 target.put(buffer[1]); 2053 if (offsets != null) { 2054 int sourceIndex = source.position() - 1; 2055 offsets.put(sourceIndex); 2056 offsets.put(sourceIndex); 2057 } 2058 } else { 2059 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, outLen, target, offsets, source.position()-1); 2060 } 2061 } else { 2062 err = CoderResult.OVERFLOW; 2063 break; 2064 } 2065 } 2066 2067 /* 2068 * the end of the input stream and detection of truncated input 2069 * are handled by the framework, but for ISO-2022-JP conversion 2070 * we need to be in ASCII mode at the very end 2071 * 2072 * conditions: 2073 * successful 2074 * in SO mode or not in ASCII mode 2075 * end of input and no truncated input 2076 */ 2077 if (!err.isError() && 2078 (myConverterData.fromU2022State.g != 0 || myConverterData.fromU2022State.cs[0] != ASCII) && 2079 flush && !source.hasRemaining() && fromUChar32 == 0) { 2080 int sourceIndex; 2081 2082 outLen = 0; 2083 2084 if (myConverterData.fromU2022State.g != 0) { 2085 buffer[outLen++] = UConverterConstants.SI; 2086 myConverterData.fromU2022State.g = 0; 2087 } 2088 2089 if (myConverterData.fromU2022State.cs[0] != ASCII) { 2090 for (int i = 0; i < escSeqChars[ASCII].length; i++) { 2091 buffer[outLen++] = escSeqChars[ASCII][i]; 2092 } 2093 myConverterData.fromU2022State.cs[0] = ASCII; 2094 } 
2095 2096 /* get the source index of the last input character */ 2097 sourceIndex = source.position(); 2098 if (sourceIndex > 0) { 2099 --sourceIndex; 2100 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) && 2101 (sourceIndex == 0 || UTF16.isLeadSurrogate(source.get(sourceIndex-1)))) { 2102 --sourceIndex; 2103 } 2104 } else { 2105 sourceIndex = -1; 2106 } 2107 2108 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, outLen, target, offsets, sourceIndex); 2109 } 2110 return err; 2111 } 2112 } 2113 /****************************ISO-2022-CN************************************/ 2114 /* 2115 * Rules for ISO-2022-CN Encoding: 2116 * i) The designator sequence must appear once on a line before any instance 2117 * of chracter set it designates. 2118 * ii) If two lines contain characters from the same character set, both lines 2119 * must include the designator sequence. 2120 * iii) Once the designator sequence is known, a shifting sequence has to be found 2121 * to invoke the shifting 2122 * iv) All lines start in ASCII and end in ASCII. 
* v)   Four shifting sequences are employed for this purpose:
     *      Sequence    ASCII Eq    Charsets
     *      ---------   ---------   --------
     *      SI          <SI>        US-ASCII
     *      SO          <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
     *      SS2         <ESC>N      CNS-11643-1992 Plane 2
     *      SS3         <ESC>O      CNS-11643-1992 Planes 3-7
     * vi)
     *      SOdesignator    : ESC "$" ")" finalchar_for_SO
     *      SS2designator   : ESC "$" "*" finalchar_for_SS2
     *      SS3designator   : ESC "$" "+" finalchar_for_SS3
     *
     *      ESC $ ) A       Indicates the bytes following SO are Chinese
     *                      characters as defined in GB 2312-80, until
     *                      another SOdesignation appears
     *
     *      ESC $ ) E       Indicates the bytes following SO are as defined
     *                      in ISO-IR-165 (for details, see section 2.1),
     *                      until another SOdesignation appears
     *
     *      ESC $ ) G       Indicates the bytes following SO are as defined
     *                      in CNS 11643-plane-1, until another SOdesignation appears
     *
     *      ESC $ * H       Indicates the two bytes immediately following
     *                      SS2 is a Chinese character as defined in CNS
     *                      11643-plane-2, until another SS2designation
     *                      appears
     *                      (Meaning <ESC>N must precede every 2-byte sequence.)
     *
     *      ESC $ + I       Indicates the immediate two bytes following SS3
     *                      is a Chinese character as defined in CNS
     *                      11643-plane-3, until another SS3designation
     *                      appears
     *                      (Meaning <ESC>O must precede every 2-byte sequence.)
     *
     *      ESC $ + J       Indicates the immediate two bytes following SS3
     *                      is a Chinese character as defined in CNS
     *                      11643-plane-4, until another SS3designation
     *                      appears
     *                      (In English: <ESC>O must precede every 2-byte sequence.)
2163 * 2164 * ESC $ + K Indicates the immediate two bytes following SS3 2165 * is a Chinese character as defined in CNS 2166 * 11643-plane-5, until another SS3designation 2167 * appears 2168 * 2169 * ESC $ + L Indicates the immediate two bytes following SS3 2170 * is a Chinese character as defined in CNS 2171 * 11643-plane-6, until another SS3designation 2172 * appears 2173 * 2174 * ESC $ + M Indicates the immediate two bytes following SS3 2175 * is a Chinese character as defined in CNS 2176 * 11643-plane-7, until another SS3designation 2177 * appears 2178 * 2179 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and 2180 * has its own designation information before any Chinese chracters 2181 * appears 2182 */ 2183 2184 /* The following are defined this way to make strings truely readonly */ 2185 private final static byte[] GB_2312_80_STR = { 0x1B, 0x24, 0x29, 0x41 }; 2186 private final static byte[] ISO_IR_165_STR = { 0x1B, 0x24, 0x29, 0x45 }; 2187 private final static byte[] CNS_11643_1992_Plane_1_STR = { 0x1B, 0x24, 0x29, 0x47 }; 2188 private final static byte[] CNS_11643_1992_Plane_2_STR = { 0x1B, 0x24, 0x2A, 0x48 }; 2189 private final static byte[] CNS_11643_1992_Plane_3_STR = { 0x1B, 0x24, 0x2B, 0x49 }; 2190 private final static byte[] CNS_11643_1992_Plane_4_STR = { 0x1B, 0x24, 0x2B, 0x4A }; 2191 private final static byte[] CNS_11643_1992_Plane_5_STR = { 0x1B, 0x24, 0x2B, 0x4B }; 2192 private final static byte[] CNS_11643_1992_Plane_6_STR = { 0x1B, 0x24, 0x2B, 0x4C }; 2193 private final static byte[] CNS_11643_1992_Plane_7_STR = { 0x1B, 0x24, 0x2B, 0x4D }; 2194 2195 /************************ ISO2022-CN Data *****************************/ 2196 private final static byte[][] escSeqCharsCN = { 2197 SHIFT_IN_STR, 2198 GB_2312_80_STR, 2199 ISO_IR_165_STR, 2200 CNS_11643_1992_Plane_1_STR, 2201 CNS_11643_1992_Plane_2_STR, 2202 CNS_11643_1992_Plane_3_STR, 2203 CNS_11643_1992_Plane_4_STR, 2204 CNS_11643_1992_Plane_5_STR, 2205 CNS_11643_1992_Plane_6_STR, 
2206 CNS_11643_1992_Plane_7_STR, 2207 }; 2208 2209 private class CharsetEncoderISO2022CN extends CharsetEncoderICU { 2210 public CharsetEncoderISO2022CN(CharsetICU cs) { 2211 super(cs, fromUSubstitutionChar[0]); 2212 } 2213 2214 @Override 2215 protected void implReset() { 2216 super.implReset(); 2217 myConverterData.reset(); 2218 } 2219 2220 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */ 2221 @Override 2222 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder, 2223 CharBuffer source, ByteBuffer target, IntBuffer offsets){ 2224 CoderResult err = CoderResult.UNDERFLOW; 2225 byte[] buffer = new byte[8]; 2226 int i = 0; 2227 byte[] subchar; 2228 subchar = encoder.replacement(); 2229 2230 if (myConverterData.fromU2022State.g != 0) { 2231 /* not in ASCII mode: switch to ASCII */ 2232 myConverterData.fromU2022State.g = 0; 2233 buffer[i++] = UConverterConstants.SI; 2234 } 2235 buffer[i++] = subchar[0]; 2236 2237 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1); 2238 2239 return err; 2240 } 2241 2242 @Override 2243 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 2244 CoderResult err = CoderResult.UNDERFLOW; 2245 int sourceChar; 2246 byte[] buffer = new byte[8]; 2247 int len; 2248 byte[] choices = new byte[3]; 2249 int choiceCount; 2250 int targetValue = 0; 2251 boolean usingFallback; 2252 boolean gotoGetTrail = false; 2253 int oldSourcePos; // For proper error handling 2254 2255 choiceCount = 0; 2256 2257 /* check if the last codepoint of previous buffer was a lead surrogate */ 2258 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) { 2259 // goto getTrail label 2260 gotoGetTrail = true; 2261 } 2262 2263 while (source.hasRemaining() || gotoGetTrail) { 2264 if (target.hasRemaining() || gotoGetTrail) { 2265 oldSourcePos = source.position(); 2266 if (!gotoGetTrail) { 2267 sourceChar = source.get(); 2268 } 2269 /* check if the char is 
a First surrogate */ 2270 if (UTF16.isSurrogate((char)sourceChar) || gotoGetTrail) { 2271 if (UTF16.isLeadSurrogate((char)sourceChar) || gotoGetTrail) { 2272// getTrail label 2273 /* reset gotoGetTrail flag*/ 2274 gotoGetTrail = false; 2275 2276 /* look ahead to find the trail surrogate */ 2277 if (source.hasRemaining()) { 2278 /* test the following code unit */ 2279 char trail = source.get(); 2280 source.position(source.position()-1); 2281 if (UTF16.isTrailSurrogate(trail)) { 2282 source.get(); 2283 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail); 2284 fromUChar32 = 0x00; 2285 /* convert this supplementary code point */ 2286 /* exit this condition tree */ 2287 } else { 2288 /* this is an unmatched lead code unit (1st surrogate) */ 2289 /* callback(illegal) */ 2290 err = CoderResult.malformedForLength(1); 2291 fromUChar32 = sourceChar; 2292 break; 2293 } 2294 } else { 2295 /* no more input */ 2296 fromUChar32 = sourceChar; 2297 break; 2298 } 2299 } else { 2300 /* this is an unmatched trail code unit (2nd surrogate) */ 2301 /* callback(illegal) */ 2302 err = CoderResult.malformedForLength(1); 2303 fromUChar32 = sourceChar; 2304 break; 2305 } 2306 } 2307 2308 /* do the conversion */ 2309 if (sourceChar <= 0x007f) { 2310 /* do not converter SO/SI/ESC */ 2311 if (IS_2022_CONTROL(sourceChar)) { 2312 /* callback(illegal) */ 2313 err = CoderResult.malformedForLength(1); 2314 fromUChar32 = sourceChar; 2315 break; 2316 } 2317 2318 /* US-ASCII */ 2319 if (myConverterData.fromU2022State.g == 0) { 2320 buffer[0] = (byte)sourceChar; 2321 len = 1; 2322 } else { 2323 buffer[0] = UConverterConstants.SI; 2324 buffer[1] = (byte)sourceChar; 2325 len = 2; 2326 myConverterData.fromU2022State.g = 0; 2327 choiceCount = 0; 2328 } 2329 2330 if (sourceChar == CR || sourceChar == LF) { 2331 /* reset the state at the end of a line */ 2332 myConverterData.fromU2022State.reset(); 2333 choiceCount = 0; 2334 } 2335 } else { 2336 /* convert U+0080..U+10ffff */ 2337 int i; 2338 byte 
cs, g; 2339 2340 if (choiceCount == 0) { 2341 /* try the current SO/G1 converter first */ 2342 choices[0] = myConverterData.fromU2022State.cs[1]; 2343 2344 /* default to GB2312_1 if none is designated yet */ 2345 if (choices[0] == 0) { 2346 choices[0] = GB2312_1; 2347 } 2348 if (myConverterData.version == 0) { 2349 /* ISO-2022-CN */ 2350 /* try other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ 2351 if (choices[0] == GB2312_1) { 2352 choices[1] = CNS_11643_1; 2353 } else { 2354 choices[1] = GB2312_1; 2355 } 2356 2357 choiceCount = 2; 2358 } else if (myConverterData.version == 1) { 2359 /* ISO-2022-CN-EXT */ 2360 2361 /* try one of the other converters */ 2362 switch (choices[0]) { 2363 case GB2312_1: 2364 choices[1] = CNS_11643_1; 2365 choices[2] = ISO_IR_165; 2366 break; 2367 case ISO_IR_165: 2368 choices[1] = GB2312_1; 2369 choices[2] = CNS_11643_1; 2370 break; 2371 default : 2372 choices[1] = GB2312_1; 2373 choices[2] = ISO_IR_165; 2374 break; 2375 } 2376 2377 choiceCount = 3; 2378 } else { 2379 /* ISO-2022-CN-CNS */ 2380 choices[0] = CNS_11643_1; 2381 choices[1] = GB2312_1; 2382 2383 choiceCount = 2; 2384 } 2385 } 2386 2387 cs = g = 0; 2388 /* 2389 * len==0: no mapping found yet 2390 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 2391 * len>0: found a roundtrip result, done 2392 */ 2393 len = 0; 2394 /* 2395 * We will turn off usingFallback after finding a fallback, 2396 * but we still get fallbacks from PUA code points as usual. 2397 * Therefore, we will also need to check that we don't overwrite 2398 * an early fallback with a later one. 
2399 */ 2400 usingFallback = useFallback; 2401 2402 for (i = 0; i < choiceCount && len <= 0; ++i) { 2403 byte cs0 = choices[i]; 2404 if (cs0 > 0) { 2405 int[] value = new int[1]; 2406 int len2; 2407 if (cs0 > CNS_11643_0) { 2408 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[CNS_11643]; 2409 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_3; 2410 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback); 2411 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[CNS_11643], 2412 // sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_3); 2413 if (len2 == 3 || (len2 == -3 && len == 0)) { 2414 targetValue = value[0]; 2415 cs = (byte)(CNS_11643_0 + (value[0] >> 16) - 0x80); 2416 if (len2 >= 0) { 2417 len = 2; 2418 } else { 2419 len = -2; 2420 usingFallback = false; 2421 } 2422 if (cs == CNS_11643_1) { 2423 g = 1; 2424 } else if (cs == CNS_11643_2) { 2425 g = 2; 2426 } else if (myConverterData.version == 1) { /* plane 3..7 */ 2427 g = 3; 2428 } else { 2429 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 2430 len = 0; 2431 } 2432 } 2433 } else { 2434 /* GB2312_1 or ISO-IR-165 */ 2435 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0]; 2436 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2; 2437 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback); 2438 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0], 2439 // sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2); 2440 if (len2 == 2 || (len2 == -2 && len == 0)) { 2441 targetValue = value[0]; 2442 len = len2; 2443 cs = cs0; 2444 g = 1; 2445 usingFallback = false; 2446 } 2447 } 2448 } 2449 } 2450 2451 if (len != 0) { 2452 len = 0; /* count output bytes; it must have ben abs(len) == 2 */ 2453 2454 /* write the designation sequence if necessary */ 2455 if (cs != 
myConverterData.fromU2022State.cs[g]) { 2456 if (cs < CNS_11643) { 2457 for (int n = 0; n < escSeqCharsCN[cs].length; n++) { 2458 buffer[n] = escSeqCharsCN[cs][n]; 2459 } 2460 } else { 2461 for (int n = 0; n < escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)].length; n++) { 2462 buffer[n] = escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)][n]; 2463 } 2464 } 2465 len = 4; 2466 myConverterData.fromU2022State.cs[g] = cs; 2467 if (g == 1) { 2468 /* changing the SO/G1 charset invalidates the choices[] */ 2469 choiceCount = 0; 2470 } 2471 } 2472 2473 /* write the shift sequence if necessary */ 2474 if (g != myConverterData.fromU2022State.g) { 2475 switch (g) { 2476 case 1: 2477 buffer[len++] = UConverterConstants.SO; 2478 2479 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ 2480 myConverterData.fromU2022State.g = 1; 2481 break; 2482 case 2: 2483 buffer[len++] = 0x1b; 2484 buffer[len++] = 0x4e; 2485 break; 2486 default: /* case 3 */ 2487 buffer[len++] = 0x1b; 2488 buffer[len++] = 0x4f; 2489 break; 2490 } 2491 } 2492 2493 /* write the two output bytes */ 2494 buffer[len++] = (byte)(targetValue >> 8); 2495 buffer[len++] = (byte)targetValue; 2496 } else { 2497 /* if we cannot find the character after checking all codepages 2498 * then this is an error 2499 */ 2500 err = CoderResult.unmappableForLength(source.position()-oldSourcePos); 2501 fromUChar32 = sourceChar; 2502 break; 2503 } 2504 } 2505 /* output len>0 bytes in buffer[] */ 2506 if (len == 1) { 2507 target.put(buffer[0]); 2508 if (offsets != null) { 2509 offsets.put(source.position()-1); 2510 } 2511 } else if (len == 2 && (target.remaining() >= 2)) { 2512 target.put(buffer[0]); 2513 target.put(buffer[1]); 2514 if (offsets != null) { 2515 int sourceIndex = source.position(); 2516 offsets.put(sourceIndex); 2517 offsets.put(sourceIndex); 2518 } 2519 } else { 2520 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, len, target, offsets, source.position()-1); 2521 if (err.isError()) { 2522 
break; 2523 } 2524 } 2525 } else { 2526 err = CoderResult.OVERFLOW; 2527 break; 2528 } 2529 } /* end while (source.hasRemaining() */ 2530 2531 /* 2532 * the end of the input stream and detection of truncated input 2533 * are handled by the framework, but for ISO-2022-CN conversion 2534 * we need to be in ASCII mode at the very end 2535 * 2536 * condtions: 2537 * succesful 2538 * not in ASCII mode 2539 * end of input and no truncated input 2540 */ 2541 if (!err.isError() && myConverterData.fromU2022State.g != 0 && flush && !source.hasRemaining() && fromUChar32 == 0) { 2542 int sourceIndex; 2543 2544 /* we are switching to ASCII */ 2545 myConverterData.fromU2022State.g = 0; 2546 2547 /* get the source index of the last input character */ 2548 sourceIndex = source.position(); 2549 if (sourceIndex > 0) { 2550 --sourceIndex; 2551 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) && 2552 (sourceIndex == 0 || UTF16.isLeadSurrogate(source.get(sourceIndex-1)))) { 2553 --sourceIndex; 2554 } 2555 } else { 2556 sourceIndex = -1; 2557 } 2558 2559 err = CharsetEncoderICU.fromUWriteBytes(this, SHIFT_IN_STR, 0, 1, target, offsets, sourceIndex); 2560 } 2561 2562 return err; 2563 } 2564 } 2565 /******************************** ISO-2022-KR *****************************/ 2566 /* 2567 * Rules for ISO-2022-KR encoding 2568 * i) The KSC5601 designator sequence should appear only once in a file, 2569 * at the begining of a line before any KSC5601 characters. 
This usually 2570 * means that it appears by itself on the first line of the file 2571 * ii) There are only 2 shifting sequences SO to shift into double byte mode 2572 * and SI to shift into single byte mode 2573 */ 2574 private class CharsetEncoderISO2022KR extends CharsetEncoderICU { 2575 public CharsetEncoderISO2022KR(CharsetICU cs) { 2576 super(cs, fromUSubstitutionChar[myConverterData.version]); 2577 } 2578 2579 @Override 2580 protected void implReset() { 2581 super.implReset(); 2582 myConverterData.reset(); 2583 setInitialStateFromUnicodeKR(this); 2584 } 2585 2586 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */ 2587 @Override 2588 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder, 2589 CharBuffer source, ByteBuffer target, IntBuffer offsets){ 2590 CoderResult err = CoderResult.UNDERFLOW; 2591 byte[] buffer = new byte[8]; 2592 int length, i = 0; 2593 byte[] subchar; 2594 2595 subchar = encoder.replacement(); 2596 length = subchar.length; 2597 2598 if (myConverterData.version == 0) { 2599 if (length == 1) { 2600 if (encoder.fromUnicodeStatus != 0) { 2601 /* in DBCS mode: switch to SBCS */ 2602 encoder.fromUnicodeStatus = 0; 2603 buffer[i++] = UConverterConstants.SI; 2604 } 2605 buffer[i++] = subchar[0]; 2606 } else { /* length == 2 */ 2607 if (encoder.fromUnicodeStatus == 0) { 2608 /* in SBCS mode: switch to DBCS */ 2609 encoder.fromUnicodeStatus = 1; 2610 buffer[i++] = UConverterConstants.SO; 2611 } 2612 buffer[i++] = subchar[0]; 2613 buffer[i++] = subchar[1]; 2614 } 2615 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1); 2616 } else { 2617 /* save the subvonverter's substitution string */ 2618 byte[] currentSubChars = myConverterData.currentEncoder.replacement(); 2619 2620 /* set our substitution string into the subconverter */ 2621 myConverterData.currentEncoder.replaceWith(subchar); 2622 myConverterData.currentConverter.subChar1 = fromUSubstitutionChar[0][0]; 2623 /* let the 
subconverter write the subchar, set/retrieve fromUChar32 state */ 2624 myConverterData.currentEncoder.fromUChar32 = encoder.fromUChar32; 2625 err = myConverterData.currentEncoder.cbFromUWriteSub(myConverterData.currentEncoder, source, target, offsets); 2626 encoder.fromUChar32 = myConverterData.currentEncoder.fromUChar32; 2627 2628 /* restore the subconverter's substitution string */ 2629 myConverterData.currentEncoder.replaceWith(currentSubChars); 2630 2631 if (err.isOverflow()) { 2632 if (myConverterData.currentEncoder.errorBufferLength > 0) { 2633 encoder.errorBuffer = myConverterData.currentEncoder.errorBuffer.clone(); 2634 } 2635 encoder.errorBufferLength = myConverterData.currentEncoder.errorBufferLength; 2636 myConverterData.currentEncoder.errorBufferLength = 0; 2637 } 2638 } 2639 2640 return err; 2641 } 2642 2643 private CoderResult encodeLoopIBM(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 2644 CoderResult err = CoderResult.UNDERFLOW; 2645 2646 myConverterData.currentEncoder.fromUChar32 = fromUChar32; 2647 err = myConverterData.currentEncoder.cnvMBCSFromUnicodeWithOffsets(source, target, offsets, flush); 2648 fromUChar32 = myConverterData.currentEncoder.fromUChar32; 2649 2650 if (err.isOverflow()) { 2651 if (myConverterData.currentEncoder.errorBufferLength > 0) { 2652 errorBuffer = myConverterData.currentEncoder.errorBuffer.clone(); 2653 } 2654 errorBufferLength = myConverterData.currentEncoder.errorBufferLength; 2655 myConverterData.currentEncoder.errorBufferLength = 0; 2656 } 2657 2658 return err; 2659 } 2660 2661 @Override 2662 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 2663 CoderResult err = CoderResult.UNDERFLOW; 2664 int[] targetByteUnit = { 0x0000 }; 2665 int sourceChar = 0x0000; 2666 boolean isTargetByteDBCS; 2667 boolean oldIsTargetByteDBCS; 2668 boolean usingFallback; 2669 int length = 0; 2670 boolean gotoGetTrail = false; // for goto getTrail label call 
2671 2672 /* 2673 * if the version is 1 then the user is requesting 2674 * conversion with ibm-25546 pass the argument to 2675 * MBCS converter and return 2676 */ 2677 if (myConverterData.version == 1) { 2678 return encodeLoopIBM(source, target, offsets, flush); 2679 } 2680 2681 usingFallback = useFallback; 2682 isTargetByteDBCS = fromUnicodeStatus == 0 ? false : true; 2683 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) { 2684 gotoGetTrail = true; 2685 } 2686 2687 while (source.hasRemaining() || gotoGetTrail) { 2688 targetByteUnit[0] = UConverterConstants.missingCharMarker; 2689 2690 if (target.hasRemaining() || gotoGetTrail) { 2691 if (!gotoGetTrail) { 2692 sourceChar = source.get(); 2693 2694 /* do not convert SO/SI/ESC */ 2695 if (IS_2022_CONTROL(sourceChar)) { 2696 /* callback(illegal) */ 2697 err = CoderResult.malformedForLength(1); 2698 fromUChar32 = sourceChar; 2699 break; 2700 } 2701 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2; 2702 length = myConverterData.currentEncoder.fromUChar32(sourceChar, targetByteUnit, usingFallback); 2703 //length = MBCSFromUChar32_ISO2022(myConverterData.currentConverter.sharedData, sourceChar, targetByteUnit, usingFallback, CharsetMBCS.MBCS_OUTPUT_2); 2704 if (length < 0) { 2705 length = -length; /* fallback */ 2706 } 2707 /* only DBCS or SBCS characters are expected */ 2708 /* DB characters with high bit set to 1 are expected */ 2709 if (length > 2 || length == 0 || 2710 (length == 1 && targetByteUnit[0] > 0x7f) || 2711 (length ==2 && 2712 ((char)(targetByteUnit[0] - 0xa1a1) > (0xfefe - 0xa1a1) || 2713 ((targetByteUnit[0] - 0xa1) & UConverterConstants.UNSIGNED_BYTE_MASK) > (0xfe - 0xa1)))) { 2714 targetByteUnit[0] = UConverterConstants.missingCharMarker; 2715 } 2716 } 2717 if (!gotoGetTrail && targetByteUnit[0] != UConverterConstants.missingCharMarker) { 2718 oldIsTargetByteDBCS = isTargetByteDBCS; 2719 isTargetByteDBCS = (targetByteUnit[0] > 0x00FF); 2720 /* append the 
shift sequence */ 2721 if (oldIsTargetByteDBCS != isTargetByteDBCS) { 2722 if (isTargetByteDBCS) { 2723 target.put((byte)UConverterConstants.SO); 2724 } else { 2725 target.put((byte)UConverterConstants.SI); 2726 } 2727 if (offsets != null) { 2728 offsets.put(source.position()-1); 2729 } 2730 } 2731 /* write the targetUniChar to target */ 2732 if (targetByteUnit[0] <= 0x00FF) { 2733 if (target.hasRemaining()) { 2734 target.put((byte)targetByteUnit[0]); 2735 if (offsets != null) { 2736 offsets.put(source.position()-1); 2737 } 2738 } else { 2739 errorBuffer[errorBufferLength++] = (byte)targetByteUnit[0]; 2740 err = CoderResult.OVERFLOW; 2741 } 2742 } else { 2743 if (target.hasRemaining()) { 2744 target.put((byte)(UConverterConstants.UNSIGNED_BYTE_MASK & ((targetByteUnit[0]>>8) - 0x80))); 2745 if (offsets != null) { 2746 offsets.put(source.position()-1); 2747 } 2748 if (target.hasRemaining()) { 2749 target.put((byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0]- 0x80))); 2750 if (offsets != null) { 2751 offsets.put(source.position()-1); 2752 } 2753 } else { 2754 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0] - 0x80)); 2755 err = CoderResult.OVERFLOW; 2756 } 2757 2758 } else { 2759 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & ((targetByteUnit[0]>>8) - 0x80)); 2760 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0]- 0x80)); 2761 err = CoderResult.OVERFLOW; 2762 } 2763 } 2764 } else { 2765 /* oops.. 
the code point is unassigned 2766 * set the error and reason 2767 */ 2768 2769 /* check if the char is a First surrogate */ 2770 if (gotoGetTrail || UTF16.isSurrogate((char)sourceChar)) { 2771 if (gotoGetTrail || UTF16.isLeadSurrogate((char)sourceChar)) { 2772// getTrail label 2773 // reset gotoGetTrail flag 2774 gotoGetTrail = false; 2775 2776 /* look ahead to find the trail surrogate */ 2777 if (source.hasRemaining()) { 2778 /* test the following code unit */ 2779 char trail = source.get(); 2780 source.position(source.position()-1); 2781 if (UTF16.isTrailSurrogate(trail)) { 2782 source.get(); 2783 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail); 2784 err = CoderResult.unmappableForLength(2); 2785 /* convert this surrogate code point */ 2786 /* exit this condition tree */ 2787 } else { 2788 /* this is an unmatched lead code unit (1st surrogate) */ 2789 /* callback(illegal) */ 2790 err = CoderResult.malformedForLength(1); 2791 } 2792 } else { 2793 /* no more input */ 2794 err = CoderResult.UNDERFLOW; 2795 } 2796 } else { 2797 /* this is an unmatched trail code unit (2nd surrogate ) */ 2798 /* callback(illegal) */ 2799 err = CoderResult.malformedForLength(1); 2800 } 2801 } else { 2802 /* callback(unassigned) for a BMP code point */ 2803 err = CoderResult.unmappableForLength(1); 2804 } 2805 2806 fromUChar32 = sourceChar; 2807 break; 2808 } 2809 } else { 2810 err = CoderResult.OVERFLOW; 2811 break; 2812 } 2813 } 2814 /* 2815 * the end of the input stream and detection of truncated input 2816 * are handled by the framework, but for ISO-2022-KR conversion 2817 * we need to be inASCII mode at the very end 2818 * 2819 * conditions: 2820 * successful 2821 * not in ASCII mode 2822 * end of input and no truncated input 2823 */ 2824 if (!err.isError() && isTargetByteDBCS && flush && !source.hasRemaining() && fromUChar32 == 0) { 2825 int sourceIndex; 2826 2827 /* we are switching to ASCII */ 2828 isTargetByteDBCS = false; 2829 2830 /* get the source index of the 
last input character */ 2831 sourceIndex = source.position(); 2832 if (sourceIndex > 0) { 2833 --sourceIndex; 2834 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) && UTF16.isLeadSurrogate(source.get(sourceIndex-1))) { 2835 --sourceIndex; 2836 } 2837 } else { 2838 sourceIndex = -1; 2839 } 2840 2841 CharsetEncoderICU.fromUWriteBytes(this, SHIFT_IN_STR, 0, 1, target, offsets, sourceIndex); 2842 } 2843 /*save the state and return */ 2844 fromUnicodeStatus = isTargetByteDBCS ? 1 : 0; 2845 2846 return err; 2847 } 2848 } 2849 2850 @Override 2851 public CharsetDecoder newDecoder() { 2852 switch (variant) { 2853 case ISO_2022_JP: 2854 return new CharsetDecoderISO2022JP(this); 2855 2856 case ISO_2022_CN: 2857 return new CharsetDecoderISO2022CN(this); 2858 2859 case ISO_2022_KR: 2860 setInitialStateToUnicodeKR(); 2861 return new CharsetDecoderISO2022KR(this); 2862 2863 default: /* should not happen */ 2864 return null; 2865 } 2866 } 2867 2868 @Override 2869 public CharsetEncoder newEncoder() { 2870 CharsetEncoderICU cnv; 2871 2872 switch (variant) { 2873 case ISO_2022_JP: 2874 return new CharsetEncoderISO2022JP(this); 2875 2876 case ISO_2022_CN: 2877 return new CharsetEncoderISO2022CN(this); 2878 2879 case ISO_2022_KR: 2880 cnv = new CharsetEncoderISO2022KR(this); 2881 setInitialStateFromUnicodeKR(cnv); 2882 return cnv; 2883 2884 default: /* should not happen */ 2885 return null; 2886 } 2887 } 2888 2889 private void setInitialStateToUnicodeKR() { 2890 if (myConverterData.version == 1) { 2891 myConverterData.currentDecoder.toUnicodeStatus = 0; /* offset */ 2892 myConverterData.currentDecoder.mode = 0; /* state */ 2893 myConverterData.currentDecoder.toULength = 0; /* byteIndex */ 2894 } 2895 } 2896 private void setInitialStateFromUnicodeKR(CharsetEncoderICU cnv) { 2897 /* ISO-2022-KR the designator sequence appears only once 2898 * in a file so we append it only once 2899 */ 2900 if (cnv.errorBufferLength == 0) { 2901 cnv.errorBufferLength = 4; 2902 cnv.errorBuffer[0] = 
0x1b; 2903 cnv.errorBuffer[1] = 0x24; 2904 cnv.errorBuffer[2] = 0x29; 2905 cnv.errorBuffer[3] = 0x43; 2906 } 2907 if (myConverterData.version == 1) { 2908 ((CharsetMBCS)myConverterData.currentEncoder.charset()).subChar1 = 0x1A; 2909 myConverterData.currentEncoder.fromUChar32 = 0; 2910 myConverterData.currentEncoder.fromUnicodeStatus = 1; /* prevLength */ 2911 } 2912 } 2913 2914 @Override 2915 void getUnicodeSetImpl(UnicodeSet setFillIn, int which) { 2916 int i; 2917 /*open a set and initialize it with code points that are algorithmically round-tripped */ 2918 2919 switch(variant){ 2920 case ISO_2022_JP: 2921 /*include JIS X 0201 which is hardcoded */ 2922 setFillIn.add(0xa5); 2923 setFillIn.add(0x203e); 2924 if((jpCharsetMasks[myConverterData.version]&CSM(ISO8859_1))!=0){ 2925 /*include Latin-1 some variants of JP */ 2926 setFillIn.add(0, 0xff); 2927 2928 } 2929 else { 2930 /* include ASCII for JP */ 2931 setFillIn.add(0, 0x7f); 2932 } 2933 if(myConverterData.version==3 || myConverterData.version==4 ||which == ROUNDTRIP_AND_FALLBACK_SET){ 2934 /* 2935 * Do not test(jpCharsetMasks[myConverterData.version]&CSM(HWKANA_7BIT))!=0 because the bit 2936 * is on for all JP versions although version 3 & 4 (JIS7 and JIS8) use half-width Katakana. 2937 * This is because all ISO_2022_JP variant are lenient in that they accept (in toUnicode) half-width 2938 * Katakana via ESC. 2939 * However, we only emit (fromUnicode) half-width Katakana according to the 2940 * definition of each variant. 2941 * 2942 * When including fallbacks, 2943 * we need to include half-width Katakana Unicode code points for all JP variants because 2944 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
2945 */ 2946 /* include half-width Katakana for JP */ 2947 setFillIn.add(HWKANA_START, HWKANA_END); 2948 } 2949 break; 2950 case ISO_2022_CN: 2951 /* Include ASCII for CN */ 2952 setFillIn.add(0, 0x7f); 2953 break; 2954 case ISO_2022_KR: 2955 /* there is only one converter for KR */ 2956 myConverterData.currentConverter.getUnicodeSetImpl(setFillIn, which); 2957 break; 2958 default: 2959 break; 2960 } 2961 2962 //TODO Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until 2963 for(i=0; i<UCNV_2022_MAX_CONVERTERS;i++){ 2964 int filter; 2965 if(myConverterData.myConverterArray[i]!=null){ 2966 if(variant==ISO_2022_CN && myConverterData.version==0 && i==CNS_11643){ 2967 /* 2968 * 2969 * version -specific for CN: 2970 * CN version 0 does not map CNS planes 3..7 although 2971 * they are all available in the CNS conversion table; 2972 * CN version 1 (-EXT) does map them all. 2973 * The two versions create different Unicode sets. 2974 */ 2975 filter=CharsetMBCS.UCNV_SET_FILTER_2022_CN; 2976 } else if(variant==ISO_2022_JP && i == JISX208){ 2977 /* 2978 * Only add code points that map to Shift-JIS codes 2979 * corrosponding to JIS X 208 2980 */ 2981 filter=CharsetMBCS.UCNV_SET_FILTER_SJIS; 2982 } else if(i==KSC5601){ 2983 /* 2984 * Some of the KSC 5601 tables (Convrtrs.txt has this aliases on multiple tables) 2985 * are broader than GR94. 2986 */ 2987 filter=CharsetMBCS.UCNV_SET_FILTER_GR94DBCS; 2988 } else { 2989 filter=CharsetMBCS.UCNV_SET_FILTER_NONE; 2990 } 2991 2992 myConverterData.currentConverter.MBCSGetFilteredUnicodeSetForUnicode(myConverterData.myConverterArray[i],setFillIn, which, filter); 2993 } 2994 } 2995 /* 2996 * ISO Converter must not convert SO/SI/ESC despite what sub-converters do by themselves 2997 * Remove these characters from the set. 2998 */ 2999 setFillIn.remove(0x0e); 3000 setFillIn.remove(0x0f); 3001 setFillIn.remove(0x1b); 3002 3003 /* ISO 2022 converter do not convert C! 
controls either */ 3004 setFillIn.remove(0x80, 0x9f); 3005 } 3006} 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016