ucnvmbcs.c revision b13da9df870a61b11249bf741347908dbea0edd8
1/* 2****************************************************************************** 3* 4* Copyright (C) 2000-2007, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7****************************************************************************** 8* file name: ucnvmbcs.c 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2000jul03 14* created by: Markus W. Scherer 15* 16* The current code in this file replaces the previous implementation 17* of conversion code from multi-byte codepages to Unicode and back. 18* This implementation supports the following: 19* - legacy variable-length codepages with up to 4 bytes per character 20* - all Unicode code points (up to 0x10ffff) 21* - efficient distinction of unassigned vs. illegal byte sequences 22* - it is possible in fromUnicode() to directly deal with simple 23* stateful encodings (used for EBCDIC_STATEFUL) 24* - it is possible to convert Unicode code points 25* to a single zero byte (but not as a fallback except for SBCS) 26* 27* Remaining limitations in fromUnicode: 28* - byte sequences must not have leading zero bytes 29* - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 30* - limitation to up to 4 bytes per character 31* 32* ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 33* limitations and adds m:n character mappings and other features. 34* See ucnv_ext.h for details. 
35* 36* Change history: 37* 38* 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 39* MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 40* macros to ucnvmbcs.h file 41*/ 42 43#include "unicode/utypes.h" 44 45#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 46 47#include "unicode/ucnv.h" 48#include "unicode/ucnv_cb.h" 49#include "unicode/udata.h" 50#include "unicode/uset.h" 51#include "ucnv_bld.h" 52#include "ucnvmbcs.h" 53#include "ucnv_ext.h" 54#include "ucnv_cnv.h" 55#include "umutex.h" 56#include "cmemory.h" 57#include "cstring.h" 58 59/* control optimizations according to the platform */ 60#define MBCS_UNROLL_SINGLE_TO_BMP 1 61#define MBCS_UNROLL_SINGLE_FROM_BMP 0 62 63/* 64 * _MBCSHeader version 4.3 65 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 66 * 67 * Change from version 4.2: 68 * - Optional utf8Friendly data structures, with 64-entry stage 3 block 69 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS 70 * files which can be used instead of stages 1 & 2. 71 * Faster lookups for roundtrips from most commonly used characters, 72 * and lookups from UTF-8 byte sequences with a natural bit distribution. 73 * See ucnvmbcs.h for more details. 74 * 75 * Change from version 4.1: 76 * - Added an optional extension table structure at the end of the .cnv file. 77 * It is present if the upper bits of the header flags field contains a non-zero 78 * byte offset to it. 79 * Files that contain only a conversion table and no base table 80 * use the special outputType MBCS_OUTPUT_EXT_ONLY. 81 * These contain the base table name between the MBCS header and the extension 82 * data. 83 * 84 * Change from version 4.0: 85 * - Replace header.reserved with header.fromUBytesLength so that all 86 * fields in the data have length. 
87 * 88 * Changes from version 3 (for performance improvements): 89 * - new bit distribution for state table entries 90 * - reordered action codes 91 * - new data structure for single-byte fromUnicode 92 * + stage 2 only contains indexes 93 * + stage 3 stores 16 bits per character with classification bits 15..8 94 * - no multiplier for stage 1 entries 95 * - stage 2 for non-single-byte codepages contains the index and the flags in 96 * one 32-bit value 97 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 98 * 99 * For more details about old versions of the MBCS data structure, see 100 * the corresponding versions of this file. 101 * 102 * Converting stateless codepage data ---------------------------------------*** 103 * (or codepage data with simple states) to Unicode. 104 * 105 * Data structure and algorithm for converting from complex legacy codepages 106 * to Unicode. (Designed before 2000-may-22.) 107 * 108 * The basic idea is that the structure of legacy codepages can be described 109 * with state tables. 110 * When reading a byte stream, each input byte causes a state transition. 111 * Some transitions result in the output of a code point, some result in 112 * "unassigned" or "illegal" output. 113 * This is used here for character conversion. 114 * 115 * The data structure begins with a state table consisting of a row 116 * per state, with 256 entries (columns) per row for each possible input 117 * byte value. 118 * Each entry is 32 bits wide, with two formats distinguished by 119 * the sign bit (bit 31): 120 * 121 * One format for transitional entries (bit 31 not set) for non-final bytes, and 122 * one format for final entries (bit 31 set). 123 * Both formats contain the number of the next state in the same bit 124 * positions. 125 * State 0 is the initial state. 126 * 127 * Most of the time, the offset values of subsequent states are added 128 * up to a scalar value. 
This value will eventually be the index of
 * the Unicode code point in a table that follows the state table.
 * The effect is that the code points for final state table rows
 * are contiguous. The code points of final state rows follow each other
 * in the order of the references to those final states by previous
 * states, etc.
 *
 * For some terminal states, the offset is itself the output Unicode
 * code point (16 bits for a BMP code point or 20 bits for a supplementary
 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
 * For others, the code point in the Unicode table is stored with either
 * one or two code units: one for BMP code points, two for a pair of
 * surrogates.
 * All code points for a final state entry take up the same number of code
 * units, regardless of whether they all actually _use_ the same number
 * of code units. This is necessary for simple array access.
 *
 * An additional feature comes in with what in ICU is called "fallback"
 * mappings:
 *
 * In addition to round-trippable, precise, 1:1 mappings, there are often
 * mappings defined between similar, though not the same, characters.
 * Typically, such mappings occur only in fromUnicode mapping tables because
 * Unicode has a superset repertoire of most other codepages. However, it
 * is possible to provide such mappings in the toUnicode tables, too.
 * In this case, the fallback mappings are partly integrated into the
 * general state tables because the structure of the encoding includes their
 * byte sequences.
 * For final entries in an initial state, fallback mappings are stored in
 * the entry itself like with roundtrip mappings.
 * For other final entries, they are stored in the code units table if
 * the entry is for a pair of code units.
160 * For single-unit results in the code units table, there is no space to 161 * alternatively hold a fallback mapping; in this case, the code unit 162 * is stored as U+fffe (unassigned), and the fallback mapping needs to 163 * be looked up by the scalar offset value in a separate table. 164 * 165 * "Unassigned" state entries really mean "structurally unassigned", 166 * i.e., such a byte sequence will never have a mapping result. 167 * 168 * The interpretation of the bits in each entry is as follows: 169 * 170 * Bit 31 not set, not a terminal entry ("transitional"): 171 * 30..24 next state 172 * 23..0 offset delta, to be added up 173 * 174 * Bit 31 set, terminal ("final") entry: 175 * 30..24 next state (regardless of action code) 176 * 23..20 action code: 177 * action codes 0 and 1 result in precise-mapping Unicode code points 178 * 0 valid byte sequence 179 * 19..16 not used, 0 180 * 15..0 16-bit Unicode BMP code point 181 * never U+fffe or U+ffff 182 * 1 valid byte sequence 183 * 19..0 20-bit Unicode supplementary code point 184 * never U+fffe or U+ffff 185 * 186 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 187 * 2 valid byte sequence (fallback) 188 * 19..16 not used, 0 189 * 15..0 16-bit Unicode BMP code point as fallback result 190 * 3 valid byte sequence (fallback) 191 * 19..0 20-bit Unicode supplementary code point as fallback result 192 * 193 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 194 * depending on the code units they result in 195 * 4 valid byte sequence 196 * 19..9 not used, 0 197 * 8..0 final offset delta 198 * pointing to one 16-bit code unit which may be 199 * fffe unassigned -- look for a fallback for this offset 200 * ffff illegal 201 * 5 valid byte sequence 202 * 19..9 not used, 0 203 * 8..0 final offset delta 204 * pointing to two 16-bit code units 205 * (typically UTF-16 surrogates) 206 * the result depends on the first code unit as follows: 207 * 0000..d7ff 
roundtrip BMP code point (1st alone) 208 * d800..dbff roundtrip surrogate pair (1st, 2nd) 209 * dc00..dfff fallback surrogate pair (1st-400, 2nd) 210 * e000 roundtrip BMP code point (2nd alone) 211 * e001 fallback BMP code point (2nd alone) 212 * fffe unassigned 213 * ffff illegal 214 * (the final offset deltas are at most 255 * 2, 215 * times 2 because of storing code unit pairs) 216 * 217 * 6 unassigned byte sequence 218 * 19..16 not used, 0 219 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 220 * this does not contain a final offset delta because the main 221 * purpose of this action code is to save scalar offset values; 222 * therefore, fallback values cannot be assigned to byte 223 * sequences that result in this action code 224 * 7 illegal byte sequence 225 * 19..16 not used, 0 226 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 227 * 8 state change only 228 * 19..0 not used, 0 229 * useful for state changes in simple stateful encodings, 230 * at Shift-In/Shift-Out codes 231 * 232 * 233 * 9..15 reserved for future use 234 * current implementations will only perform a state change 235 * and ignore bits 19..0 236 * 237 * An encoding with contiguous ranges of unassigned byte sequences, like 238 * Shift-JIS and especially EUC-TW, can be stored efficiently by having 239 * at least two states for the trail bytes: 240 * One trail byte state that results in code points, and one that only 241 * has "unassigned" and "illegal" terminal states. 242 * 243 * Note: partly by accident, this data structure supports simple stateful 244 * encodings without any additional logic. 245 * Currently, only simple Shift-In/Shift-Out schemes are handled with 246 * appropriate state tables (especially EBCDIC_STATEFUL!). 
247 * 248 * MBCS version 2 added: 249 * unassigned and illegal action codes have U+fffe and U+ffff 250 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 251 * 252 * Converting from Unicode to codepage bytes --------------------------------*** 253 * 254 * The conversion data structure for fromUnicode is designed for the known 255 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 256 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 257 * a roundtrip mapping. 258 * 259 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 260 * like in the character properties table. 261 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 262 * with the resulting bytes is at offsetFromUBytes. 263 * 264 * Beginning with version 4, single-byte codepages have a significantly different 265 * trie compared to other codepages. 266 * In all cases, the entry in stage 1 is directly the index of the block of 267 * 64 entries in stage 2. 268 * 269 * Single-byte lookup: 270 * 271 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 272 * Stage 3 contains one 16-bit word per result: 273 * Bits 15..8 indicate the kind of result: 274 * f roundtrip result 275 * c fallback result from private-use code point 276 * 8 fallback result from other code points 277 * 0 unassigned 278 * Bits 7..0 contain the codepage byte. A zero byte is always possible. 279 * 280 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 281 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 282 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 283 * ASCII code points can be looked up with a linear array access into stage 3. 284 * See maxFastUChar and other details in ucnvmbcs.h. 
285 * 286 * Multi-byte lookup: 287 * 288 * Stage 2 contains a 32-bit word for each 16-block in stage 3: 289 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 290 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 291 * If this test is false, then a non-zero result will be interpreted as 292 * a fallback mapping. 293 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 294 * 295 * Stage 3 contains 2, 3, or 4 bytes per result. 296 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 297 * while 3 bytes are stored as bytes in big-endian order. 298 * Leading zero bytes are ignored, and the number of bytes is counted. 299 * A zero byte mapping result is possible as a roundtrip result. 300 * For some output types, the actual result is processed from this; 301 * see ucnv_MBCSFromUnicodeWithOffsets(). 302 * 303 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 304 * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 305 * 306 * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 307 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 308 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 309 * ASCII code points can be looked up with a linear array access into stage 3. 310 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 311 * 312 * In version 3, stage 2 blocks may overlap by multiples of the multiplier 313 * for compaction. 314 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 315 * may overlap by any number of entries. 
316 * 317 * MBCS version 2 added: 318 * the converter checks for known output types, which allows 319 * adding new ones without crashing an unaware converter 320 */ 321 322static const UConverterImpl _SBCSUTF8Impl; 323static const UConverterImpl _DBCSUTF8Impl; 324 325/* GB 18030 data ------------------------------------------------------------ */ 326 327/* helper macros for linear values for GB 18030 four-byte sequences */ 328#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 329 330#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 331 332#define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 333 334/* 335 * Some ranges of GB 18030 where both the Unicode code points and the 336 * GB four-byte sequences are contiguous and are handled algorithmically by 337 * the special callback functions below. 338 * The values are start & end of Unicode & GB codes. 339 * 340 * Note that single surrogates are not mapped by GB 18030 341 * as of the re-released mapping tables from 2000-nov-30. 
342 */ 343static const uint32_t 344gb18030Ranges[13][4]={ 345 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 346 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 347 {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)}, 348 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 349 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 350 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 351 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 352 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 353 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 354 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 355 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 356 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 357 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 358}; 359 360/* bit flag for UConverter.options indicating GB 18030 special handling */ 361#define _MBCS_OPTION_GB18030 0x8000 362 363/* Miscellaneous ------------------------------------------------------------ */ 364 365/* similar to ucnv_MBCSGetNextUChar() but recursive */ 366static void 367_getUnicodeSetForBytes(const UConverterSharedData *sharedData, 368 const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits, 369 const USetAdder *sa, 370 UConverterUnicodeSet which, 371 uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte, 372 373 UErrorCode *pErrorCode) { 374 int32_t b, entry; 375 376 for(b=lowByte; b<=highByte; ++b) { 377 entry=stateTable[state][b]; 378 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 379 _getUnicodeSetForBytes( 380 sharedData, stateTable, unicodeCodeUnits, 381 sa, which, 382 (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry), 383 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 384 0, 0xff, 385 pErrorCode); 386 } else { 387 UChar32 c; 388 int32_t rowOffset=offset; 389 uint8_t action; 390 391 c=U_SENTINEL; 392 393 /* 394 * An if-else-if chain provides more reliable performance for 395 * the 
most common cases compared to a switch. 396 */ 397 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 398 if(action==MBCS_STATE_VALID_DIRECT_16) { 399 /* output BMP code point */ 400 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 401 } else if(action==MBCS_STATE_VALID_16) { 402 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 403 c=unicodeCodeUnits[offset]; 404 if(c<0xfffe) { 405 /* output BMP code point */ 406 } else { 407 c=U_SENTINEL; 408 } 409 } else if(action==MBCS_STATE_VALID_16_PAIR) { 410 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 411 c=unicodeCodeUnits[offset++]; 412 if(c<0xd800) { 413 /* output BMP code point below 0xd800 */ 414 } else if(c<=0xdbff) { 415 /* output roundtrip or fallback supplementary code point */ 416 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 417 } else if(c==0xe000) { 418 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 419 c=unicodeCodeUnits[offset]; 420 } else { 421 c=U_SENTINEL; 422 } 423 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 424 /* output supplementary code point */ 425 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 426 } 427 428 if(c>=0) { 429 sa->add(sa->set, c); 430 } 431 offset=rowOffset; 432 } 433 } 434} 435 436/* 437 * Internal function returning a UnicodeSet for toUnicode() conversion. 438 * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. 439 * In the future, if we add support for reverse-fallback sets, this function 440 * needs to be updated, and called for each initial state. 441 * Does not currently handle extensions. 442 * Does not empty the set first. 
443 */ 444U_CFUNC void 445ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData, 446 const USetAdder *sa, 447 UConverterUnicodeSet which, 448 uint8_t state, int32_t lowByte, int32_t highByte, 449 UErrorCode *pErrorCode) { 450 _getUnicodeSetForBytes( 451 sharedData, sharedData->mbcs.stateTable, sharedData->mbcs.unicodeCodeUnits, 452 sa, which, 453 state, 0, lowByte, highByte, 454 pErrorCode); 455} 456 457U_CFUNC void 458ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, 459 const USetAdder *sa, 460 UConverterUnicodeSet which, 461 UErrorCode *pErrorCode) { 462 const UConverterMBCSTable *mbcsTable; 463 const uint16_t *table; 464 465 uint32_t st3; 466 uint16_t st1, maxStage1, st2; 467 468 UChar32 c; 469 470 /* enumerate the from-Unicode trie table */ 471 mbcsTable=&sharedData->mbcs; 472 table=mbcsTable->fromUnicodeTable; 473 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { 474 maxStage1=0x440; 475 } else { 476 maxStage1=0x40; 477 } 478 479 c=0; /* keep track of the current code point while enumerating */ 480 481 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 482 const uint16_t *stage2, *stage3, *results; 483 484 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 485 486 for(st1=0; st1<maxStage1; ++st1) { 487 st2=table[st1]; 488 if(st2>maxStage1) { 489 stage2=table+st2; 490 for(st2=0; st2<64; ++st2) { 491 if((st3=stage2[st2])!=0) { 492 /* read the stage 3 block */ 493 stage3=results+st3; 494 495 /* 496 * Add code points for which the roundtrip flag is set. 497 * Once we get a set for fallback mappings, we have to use 498 * a threshold variable with a value of 0x800. 499 * See ucnv_MBCSSingleFromBMPWithOffsets() and 500 * MBCS_SINGLE_RESULT_FROM_U() for details. 
501 */ 502 do { 503 if(*stage3++>=0xf00) { 504 sa->add(sa->set, c); 505 } 506 } while((++c&0xf)!=0); 507 } else { 508 c+=16; /* empty stage 3 block */ 509 } 510 } 511 } else { 512 c+=1024; /* empty stage 2 block */ 513 } 514 } 515 } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) { 516 /* ignore single-byte results */ 517 const uint32_t *stage2; 518 const uint16_t *stage3, *results; 519 520 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 521 522 for(st1=0; st1<maxStage1; ++st1) { 523 st2=table[st1]; 524 if(st2>(maxStage1>>1)) { 525 stage2=(const uint32_t *)table+st2; 526 for(st2=0; st2<64; ++st2) { 527 if((st3=stage2[st2])!=0) { 528 /* read the stage 3 block */ 529 stage3=results+16*(uint32_t)(uint16_t)st3; 530 531 /* get the roundtrip flags for the stage 3 block */ 532 st3>>=16; 533 534 /* 535 * Add code points for which the roundtrip flag is set. 536 * Once we get a set for fallback mappings, we have to check 537 * non-roundtrip stage 3 results for whether they are 0. 538 * See ucnv_MBCSFromUnicodeWithOffsets() for details. 539 * 540 * Ignore single-byte results (<0x100). 541 */ 542 do { 543 if((st3&1)!=0 && *stage3>=0x100) { 544 sa->add(sa->set, c); 545 } 546 st3>>=1; 547 ++stage3; 548 } while((++c&0xf)!=0); 549 } else { 550 c+=16; /* empty stage 3 block */ 551 } 552 } 553 } else { 554 c+=1024; /* empty stage 2 block */ 555 } 556 } 557 } else { 558 const uint32_t *stage2; 559 560 for(st1=0; st1<maxStage1; ++st1) { 561 st2=table[st1]; 562 if(st2>(maxStage1>>1)) { 563 stage2=(const uint32_t *)table+st2; 564 for(st2=0; st2<64; ++st2) { 565 if((st3=stage2[st2])!=0) { 566 /* get the roundtrip flags for the stage 3 block */ 567 st3>>=16; 568 569 /* 570 * Add code points for which the roundtrip flag is set. 571 * Once we get a set for fallback mappings, we have to check 572 * non-roundtrip stage 3 results for whether they are 0. 573 * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
574 */ 575 do { 576 if(st3&1) { 577 sa->add(sa->set, c); 578 } 579 st3>>=1; 580 } while((++c&0xf)!=0); 581 } else { 582 c+=16; /* empty stage 3 block */ 583 } 584 } 585 } else { 586 c+=1024; /* empty stage 2 block */ 587 } 588 } 589 } 590 591 ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); 592} 593 594static void 595ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 596 const USetAdder *sa, 597 UConverterUnicodeSet which, 598 UErrorCode *pErrorCode) { 599 if(cnv->options&_MBCS_OPTION_GB18030) { 600 sa->addRange(sa->set, 0, 0xd7ff); 601 sa->addRange(sa->set, 0xe000, 0x10ffff); 602 } else { 603 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode); 604 } 605} 606 607/* conversion extensions for input not in the main table -------------------- */ 608 609/* 610 * Hardcoded extension handling for GB 18030. 611 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. 612 * 613 * In the future, conversion extensions may handle m:n mappings and delta tables, 614 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html 615 * 616 * If an input character cannot be mapped, then these functions set an error 617 * code. The framework will then call the callback function. 
618 */ 619 620/* 621 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 622 * else return 0 after output has been written to the target 623 */ 624static UChar32 625_extFromU(UConverter *cnv, const UConverterSharedData *sharedData, 626 UChar32 cp, 627 const UChar **source, const UChar *sourceLimit, 628 uint8_t **target, const uint8_t *targetLimit, 629 int32_t **offsets, int32_t sourceIndex, 630 UBool flush, 631 UErrorCode *pErrorCode) { 632 const int32_t *cx; 633 634 cnv->useSubChar1=FALSE; 635 636 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 637 ucnv_extInitialMatchFromU( 638 cnv, cx, 639 cp, source, sourceLimit, 640 (char **)target, (char *)targetLimit, 641 offsets, sourceIndex, 642 flush, 643 pErrorCode) 644 ) { 645 return 0; /* an extension mapping handled the input */ 646 } 647 648 /* GB 18030 */ 649 if((cnv->options&_MBCS_OPTION_GB18030)!=0) { 650 const uint32_t *range; 651 int32_t i; 652 653 range=gb18030Ranges[0]; 654 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 655 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) { 656 /* found the Unicode code point, output the four-byte sequence for it */ 657 uint32_t linear; 658 char bytes[4]; 659 660 /* get the linear value of the first GB 18030 code in this range */ 661 linear=range[2]-LINEAR_18030_BASE; 662 663 /* add the offset from the beginning of the range */ 664 linear+=((uint32_t)cp-range[0]); 665 666 /* turn this into a four-byte sequence */ 667 bytes[3]=(char)(0x30+linear%10); linear/=10; 668 bytes[2]=(char)(0x81+linear%126); linear/=126; 669 bytes[1]=(char)(0x30+linear%10); linear/=10; 670 bytes[0]=(char)(0x81+linear); 671 672 /* output this sequence */ 673 ucnv_fromUWriteBytes(cnv, 674 bytes, 4, (char **)target, (char *)targetLimit, 675 offsets, sourceIndex, pErrorCode); 676 return 0; 677 } 678 } 679 } 680 681 /* no mapping */ 682 *pErrorCode=U_INVALID_CHAR_FOUND; 683 return cp; 684} 685 686/* 687 * Input sequence: cnv->toUBytes[0..length[ 688 * @return 
if(U_FAILURE) return the length (toULength, byteIndex) for the input 689 * else return 0 after output has been written to the target 690 */ 691static int8_t 692_extToU(UConverter *cnv, const UConverterSharedData *sharedData, 693 int8_t length, 694 const uint8_t **source, const uint8_t *sourceLimit, 695 UChar **target, const UChar *targetLimit, 696 int32_t **offsets, int32_t sourceIndex, 697 UBool flush, 698 UErrorCode *pErrorCode) { 699 const int32_t *cx; 700 701 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 702 ucnv_extInitialMatchToU( 703 cnv, cx, 704 length, (const char **)source, (const char *)sourceLimit, 705 target, targetLimit, 706 offsets, sourceIndex, 707 flush, 708 pErrorCode) 709 ) { 710 return 0; /* an extension mapping handled the input */ 711 } 712 713 /* GB 18030 */ 714 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { 715 const uint32_t *range; 716 uint32_t linear; 717 int32_t i; 718 719 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]); 720 range=gb18030Ranges[0]; 721 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 722 if(range[2]<=linear && linear<=range[3]) { 723 /* found the sequence, output the Unicode code point for it */ 724 *pErrorCode=U_ZERO_ERROR; 725 726 /* add the linear difference between the input and start sequences to the start code point */ 727 linear=range[0]+(linear-range[2]); 728 729 /* output this code point */ 730 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode); 731 732 return 0; 733 } 734 } 735 } 736 737 /* no mapping */ 738 *pErrorCode=U_INVALID_CHAR_FOUND; 739 return length; 740} 741 742/* EBCDIC swap LF<->NL ------------------------------------------------------ */ 743 744/* 745 * This code modifies a standard EBCDIC<->Unicode mapping table for 746 * OS/390 (z/OS) Unix System Services (Open Edition). 
747 * The difference is in the mapping of Line Feed and New Line control codes: 748 * Standard EBCDIC maps 749 * 750 * <U000A> \x25 |0 751 * <U0085> \x15 |0 752 * 753 * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 754 * mapping 755 * 756 * <U000A> \x15 |0 757 * <U0085> \x25 |0 758 * 759 * This code modifies a loaded standard EBCDIC<->Unicode mapping table 760 * by copying it into allocated memory and swapping the LF and NL values. 761 * It allows to support the same EBCDIC charset in both versions without 762 * duplicating the entire installed table. 763 */ 764 765/* standard EBCDIC codes */ 766#define EBCDIC_LF 0x25 767#define EBCDIC_NL 0x15 768 769/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 770#define EBCDIC_RT_LF 0xf25 771#define EBCDIC_RT_NL 0xf15 772 773/* Unicode code points */ 774#define U_LF 0x0a 775#define U_NL 0x85 776 777static UBool 778_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { 779 UConverterMBCSTable *mbcsTable; 780 781 const uint16_t *table, *results; 782 const uint8_t *bytes; 783 784 int32_t (*newStateTable)[256]; 785 uint16_t *newResults; 786 uint8_t *p; 787 char *name; 788 789 uint32_t stage2Entry; 790 uint32_t size, sizeofFromUBytes; 791 792 mbcsTable=&sharedData->mbcs; 793 794 table=mbcsTable->fromUnicodeTable; 795 bytes=mbcsTable->fromUnicodeBytes; 796 results=(const uint16_t *)bytes; 797 798 /* 799 * Check that this is an EBCDIC table with SBCS portion - 800 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings. 801 * 802 * If not, ignore the option. Options are always ignored if they do not apply. 
803 */ 804 if(!( 805 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) && 806 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 807 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL) 808 )) { 809 return FALSE; 810 } 811 812 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 813 if(!( 814 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 815 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL) 816 )) { 817 return FALSE; 818 } 819 } else /* MBCS_OUTPUT_2_SISO */ { 820 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 821 if(!( 822 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 && 823 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF) 824 )) { 825 return FALSE; 826 } 827 828 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 829 if(!( 830 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 && 831 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL) 832 )) { 833 return FALSE; 834 } 835 } 836 837 if(mbcsTable->fromUBytesLength>0) { 838 /* 839 * We _know_ the number of bytes in the fromUnicodeBytes array 840 * starting with header.version 4.1. 841 */ 842 sizeofFromUBytes=mbcsTable->fromUBytesLength; 843 } else { 844 /* 845 * Otherwise: 846 * There used to be code to enumerate the fromUnicode 847 * trie and find the highest entry, but it was removed in ICU 3.2 848 * because it was not tested and caused a low code coverage number. 849 * See Jitterbug 3674. 850 * This affects only some .cnv file formats with a header.version 851 * below 4.1, and only when swaplfnl is requested. 852 * 853 * ucnvmbcs.c revision 1.99 is the last one with the 854 * ucnv_MBCSSizeofFromUBytes() function. 855 */ 856 *pErrorCode=U_INVALID_FORMAT_ERROR; 857 return FALSE; 858 } 859 860 /* 861 * The table has an appropriate format. 
862 * Allocate and build 863 * - a modified to-Unicode state table 864 * - a modified from-Unicode output array 865 * - a converter name string with the swap option appended 866 */ 867 size= 868 mbcsTable->countStates*1024+ 869 sizeofFromUBytes+ 870 UCNV_MAX_CONVERTER_NAME_LENGTH+20; 871 p=(uint8_t *)uprv_malloc(size); 872 if(p==NULL) { 873 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 874 return FALSE; 875 } 876 877 /* copy and modify the to-Unicode state table */ 878 newStateTable=(int32_t (*)[256])p; 879 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024); 880 881 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 882 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 883 884 /* copy and modify the from-Unicode result table */ 885 newResults=(uint16_t *)newStateTable[mbcsTable->countStates]; 886 uprv_memcpy(newResults, bytes, sizeofFromUBytes); 887 888 /* conveniently, the table access macros work on the left side of expressions */ 889 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 890 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL; 891 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF; 892 } else /* MBCS_OUTPUT_2_SISO */ { 893 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 894 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL; 895 896 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 897 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF; 898 } 899 900 /* set the canonical converter name */ 901 name=(char *)newResults+sizeofFromUBytes; 902 uprv_strcpy(name, sharedData->staticData->name); 903 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING); 904 905 /* set the pointers */ 906 umtx_lock(NULL); 907 if(mbcsTable->swapLFNLStateTable==NULL) { 908 mbcsTable->swapLFNLStateTable=newStateTable; 909 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults; 910 mbcsTable->swapLFNLName=name; 911 912 newStateTable=NULL; 913 } 914 
umtx_unlock(NULL); 915 916 /* release the allocated memory if another thread beat us to it */ 917 if(newStateTable!=NULL) { 918 uprv_free(newStateTable); 919 } 920 return TRUE; 921} 922 923/* MBCS setup functions ----------------------------------------------------- */ 924 925static void 926ucnv_MBCSLoad(UConverterSharedData *sharedData, 927 UConverterLoadArgs *pArgs, 928 const uint8_t *raw, 929 UErrorCode *pErrorCode) { 930 UDataInfo info; 931 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 932 _MBCSHeader *header=(_MBCSHeader *)raw; 933 uint32_t offset; 934 935 if(header->version[0]!=4) { 936 *pErrorCode=U_INVALID_TABLE_FORMAT; 937 return; 938 } 939 940 mbcsTable->outputType=(uint8_t)header->flags; 941 942 /* extension data, header version 4.2 and higher */ 943 offset=header->flags>>8; 944 if(offset!=0) { 945 mbcsTable->extIndexes=(const int32_t *)(raw+offset); 946 } 947 948 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) { 949 UConverterLoadArgs args={ 0 }; 950 UConverterSharedData *baseSharedData; 951 const int32_t *extIndexes; 952 const char *baseName; 953 954 /* extension-only file, load the base table and set values appropriately */ 955 if((extIndexes=mbcsTable->extIndexes)==NULL) { 956 /* extension-only file without extension */ 957 *pErrorCode=U_INVALID_TABLE_FORMAT; 958 return; 959 } 960 961 if(pArgs->nestedLoads!=1) { 962 /* an extension table must not be loaded as a base table */ 963 *pErrorCode=U_INVALID_TABLE_FILE; 964 return; 965 } 966 967 /* load the base table */ 968 baseName=(const char *)(header+1); 969 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) { 970 /* forbid loading this same extension-only file */ 971 *pErrorCode=U_INVALID_TABLE_FORMAT; 972 return; 973 } 974 975 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? 
*/ 976 args.size=sizeof(UConverterLoadArgs); 977 args.nestedLoads=2; 978 args.reserved=pArgs->reserved; 979 args.options=pArgs->options; 980 args.pkg=pArgs->pkg; 981 args.name=baseName; 982 baseSharedData=ucnv_load(&args, pErrorCode); 983 if(U_FAILURE(*pErrorCode)) { 984 return; 985 } 986 if( baseSharedData->staticData->conversionType!=UCNV_MBCS || 987 baseSharedData->mbcs.baseSharedData!=NULL 988 ) { 989 ucnv_unload(baseSharedData); 990 *pErrorCode=U_INVALID_TABLE_FORMAT; 991 return; 992 } 993 994 /* copy the base table data */ 995 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable)); 996 997 /* overwrite values with relevant ones for the extension converter */ 998 mbcsTable->baseSharedData=baseSharedData; 999 mbcsTable->extIndexes=extIndexes; 1000 1001 /* 1002 * It would be possible to share the swapLFNL data with a base converter, 1003 * but the generated name would have to be different, and the memory 1004 * would have to be free'd only once. 1005 * It is easier to just create the data for the extension converter 1006 * separately when it is requested. 1007 */ 1008 mbcsTable->swapLFNLStateTable=NULL; 1009 mbcsTable->swapLFNLFromUnicodeBytes=NULL; 1010 mbcsTable->swapLFNLName=NULL; 1011 1012 /* 1013 * Set a special, runtime-only outputType if the extension converter 1014 * is a DBCS version of a base converter that also maps single bytes. 
1015 */ 1016 if( sharedData->staticData->conversionType==UCNV_DBCS || 1017 (sharedData->staticData->conversionType==UCNV_MBCS && 1018 sharedData->staticData->minBytesPerChar>=2) 1019 ) { 1020 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) { 1021 /* the base converter is SI/SO-stateful */ 1022 int32_t entry; 1023 1024 /* get the dbcs state from the state table entry for SO=0x0e */ 1025 entry=mbcsTable->stateTable[0][0xe]; 1026 if( MBCS_ENTRY_IS_FINAL(entry) && 1027 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && 1028 MBCS_ENTRY_FINAL_STATE(entry)!=0 1029 ) { 1030 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1031 1032 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1033 } 1034 } else if( 1035 baseSharedData->staticData->conversionType==UCNV_MBCS && 1036 baseSharedData->staticData->minBytesPerChar==1 && 1037 baseSharedData->staticData->maxBytesPerChar==2 && 1038 mbcsTable->countStates<=127 1039 ) { 1040 /* non-stateful base converter, need to modify the state table */ 1041 int32_t (*newStateTable)[256]; 1042 int32_t *state; 1043 int32_t i, count; 1044 1045 /* allocate a new state table and copy the base state table contents */ 1046 count=mbcsTable->countStates; 1047 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024); 1048 if(newStateTable==NULL) { 1049 ucnv_unload(baseSharedData); 1050 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1051 return; 1052 } 1053 1054 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024); 1055 1056 /* change all final single-byte entries to go to a new all-illegal state */ 1057 state=newStateTable[0]; 1058 for(i=0; i<256; ++i) { 1059 if(MBCS_ENTRY_IS_FINAL(state[i])) { 1060 state[i]=MBCS_ENTRY_TRANSITION(count, 0); 1061 } 1062 } 1063 1064 /* build the new all-illegal state */ 1065 state=newStateTable[count]; 1066 for(i=0; i<256; ++i) { 1067 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 1068 } 1069 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable; 1070 
mbcsTable->countStates=(uint8_t)(count+1); 1071 mbcsTable->stateTableOwned=TRUE; 1072 1073 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1074 } 1075 } 1076 1077 /* 1078 * unlike below for files with base tables, do not get the unicodeMask 1079 * from the sharedData; instead, use the base table's unicodeMask, 1080 * which we copied in the memcpy above; 1081 * this is necessary because the static data unicodeMask, especially 1082 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 1083 */ 1084 } else { 1085 /* conversion file with a base table; an additional extension table is optional */ 1086 /* make sure that the output type is known */ 1087 switch(mbcsTable->outputType) { 1088 case MBCS_OUTPUT_1: 1089 case MBCS_OUTPUT_2: 1090 case MBCS_OUTPUT_3: 1091 case MBCS_OUTPUT_4: 1092 case MBCS_OUTPUT_3_EUC: 1093 case MBCS_OUTPUT_4_EUC: 1094 case MBCS_OUTPUT_2_SISO: 1095 /* OK */ 1096 break; 1097 default: 1098 *pErrorCode=U_INVALID_TABLE_FORMAT; 1099 return; 1100 } 1101 1102 mbcsTable->countStates=(uint8_t)header->countStates; 1103 mbcsTable->countToUFallbacks=header->countToUFallbacks; 1104 mbcsTable->stateTable=(const int32_t (*)[256])(raw+sizeof(_MBCSHeader)); 1105 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates); 1106 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits); 1107 1108 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable); 1109 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes); 1110 mbcsTable->fromUBytesLength=header->fromUBytesLength; 1111 1112 /* 1113 * converter versions 6.1 and up contain a unicodeMask that is 1114 * used here to select the most efficient function implementations 1115 */ 1116 info.size=sizeof(UDataInfo); 1117 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 1118 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { 1119 /* mask off possible future extensions to 
be safe */ 1120 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3); 1121 } else { 1122 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 1123 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; 1124 } 1125 1126 /* 1127 * _MBCSHeader.version 4.3 adds utf8Friendly data structures. 1128 * Check for the header version, SBCS vs. MBCS, and for whether the 1129 * data structures are optimized for code points as high as what the 1130 * runtime code is designed for. 1131 * The implementation does not handle mapping tables with entries for 1132 * unpaired surrogates. 1133 */ 1134 if( header->version[1]>=3 && 1135 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 && 1136 (mbcsTable->countStates==1 ? 1137 (header->version[2]>=(SBCS_FAST_MAX>>8)) : 1138 (header->version[2]>=(MBCS_FAST_MAX>>8)) 1139 ) 1140 ) { 1141 mbcsTable->utf8Friendly=TRUE; 1142 1143 if(mbcsTable->countStates==1) { 1144 /* 1145 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 1146 * Build a table with indexes to each block, to be used instead of 1147 * the regular stage 1/2 table. 1148 */ 1149 int32_t i; 1150 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) { 1151 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 1152 } 1153 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */ 1154 mbcsTable->maxFastUChar=SBCS_FAST_MAX; 1155 } else { 1156 /* 1157 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 1158 * The .cnv file is prebuilt with an additional stage table with indexes 1159 * to each block. 
1160 */ 1161 mbcsTable->mbcsIndex=(const uint16_t *)(mbcsTable->fromUnicodeBytes+mbcsTable->fromUBytesLength); 1162 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff; 1163 } 1164 } 1165 1166 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 1167 { 1168 uint32_t asciiRoundtrips=0xffffffff; 1169 int32_t i; 1170 1171 for(i=0; i<0x80; ++i) { 1172 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 1173 asciiRoundtrips&=~((uint32_t)1<<(i>>2)); 1174 } 1175 } 1176 mbcsTable->asciiRoundtrips=asciiRoundtrips; 1177 } 1178 } 1179 1180 /* Set the impl pointer here so that it is set for both extension-only and base tables. */ 1181 if(mbcsTable->utf8Friendly) { 1182 if(mbcsTable->countStates==1) { 1183 sharedData->impl=&_SBCSUTF8Impl; 1184 } else { 1185 if(mbcsTable->outputType==MBCS_OUTPUT_2) { 1186 sharedData->impl=&_DBCSUTF8Impl; 1187 } 1188 } 1189 } 1190 1191 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) { 1192 /* 1193 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. 1194 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. 
1195 */ 1196 mbcsTable->asciiRoundtrips=0; 1197 } 1198} 1199 1200static void 1201ucnv_MBCSUnload(UConverterSharedData *sharedData) { 1202 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1203 1204 if(mbcsTable->swapLFNLStateTable!=NULL) { 1205 uprv_free(mbcsTable->swapLFNLStateTable); 1206 } 1207 if(mbcsTable->stateTableOwned) { 1208 uprv_free((void *)mbcsTable->stateTable); 1209 } 1210 if(mbcsTable->baseSharedData!=NULL) { 1211 ucnv_unload(mbcsTable->baseSharedData); 1212 } 1213} 1214 1215static void 1216ucnv_MBCSOpen(UConverter *cnv, 1217 const char *name, 1218 const char *locale, 1219 uint32_t options, 1220 UErrorCode *pErrorCode) { 1221 UConverterMBCSTable *mbcsTable; 1222 const int32_t *extIndexes; 1223 uint8_t outputType; 1224 int8_t maxBytesPerUChar; 1225 1226 mbcsTable=&cnv->sharedData->mbcs; 1227 outputType=mbcsTable->outputType; 1228 1229 if(outputType==MBCS_OUTPUT_DBCS_ONLY) { 1230 /* the swaplfnl option does not apply, remove it */ 1231 cnv->options=options&=~UCNV_OPTION_SWAP_LFNL; 1232 } 1233 1234 if((options&UCNV_OPTION_SWAP_LFNL)!=0) { 1235 /* do this because double-checked locking is broken */ 1236 UBool isCached; 1237 1238 umtx_lock(NULL); 1239 isCached=mbcsTable->swapLFNLStateTable!=NULL; 1240 umtx_unlock(NULL); 1241 1242 if(!isCached) { 1243 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { 1244 if(U_FAILURE(*pErrorCode)) { 1245 return; /* something went wrong */ 1246 } 1247 1248 /* the option does not apply, remove it */ 1249 cnv->options=options&=~UCNV_OPTION_SWAP_LFNL; 1250 } 1251 } 1252 } 1253 1254 if(uprv_strstr(name, "18030")!=NULL) { 1255 if(uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL) { 1256 /* set a flag for GB 18030 mode, which changes the callback behavior */ 1257 cnv->options|=_MBCS_OPTION_GB18030; 1258 } 1259 } 1260 1261 /* fix maxBytesPerUChar depending on outputType and options etc. 
*/ 1262 if(outputType==MBCS_OUTPUT_2_SISO) { 1263 cnv->maxBytesPerUChar=3; /* SO+DBCS */ 1264 } 1265 1266 extIndexes=mbcsTable->extIndexes; 1267 if(extIndexes!=NULL) { 1268 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes); 1269 if(outputType==MBCS_OUTPUT_2_SISO) { 1270 ++maxBytesPerUChar; /* SO + multiple DBCS */ 1271 } 1272 1273 if(maxBytesPerUChar>cnv->maxBytesPerUChar) { 1274 cnv->maxBytesPerUChar=maxBytesPerUChar; 1275 } 1276 } 1277 1278#if 0 1279 /* 1280 * documentation of UConverter fields used for status 1281 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset() 1282 */ 1283 1284 /* toUnicode */ 1285 cnv->toUnicodeStatus=0; /* offset */ 1286 cnv->mode=0; /* state */ 1287 cnv->toULength=0; /* byteIndex */ 1288 1289 /* fromUnicode */ 1290 cnv->fromUChar32=0; 1291 cnv->fromUnicodeStatus=1; /* prevLength */ 1292#endif 1293} 1294 1295static const char * 1296ucnv_MBCSGetName(const UConverter *cnv) { 1297 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) { 1298 return cnv->sharedData->mbcs.swapLFNLName; 1299 } else { 1300 return cnv->sharedData->staticData->name; 1301 } 1302} 1303 1304/* MBCS-to-Unicode conversion functions ------------------------------------- */ 1305 1306static UChar32 1307ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) { 1308 const _MBCSToUFallback *toUFallbacks; 1309 uint32_t i, start, limit; 1310 1311 limit=mbcsTable->countToUFallbacks; 1312 if(limit>0) { 1313 /* do a binary search for the fallback mapping */ 1314 toUFallbacks=mbcsTable->toUFallbacks; 1315 start=0; 1316 while(start<limit-1) { 1317 i=(start+limit)/2; 1318 if(offset<toUFallbacks[i].offset) { 1319 limit=i; 1320 } else { 1321 start=i; 1322 } 1323 } 1324 1325 /* did we really find it? 
*/ 1326 if(offset==toUFallbacks[start].offset) { 1327 return toUFallbacks[start].codePoint; 1328 } 1329 } 1330 1331 return 0xfffe; 1332} 1333 1334/* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */ 1335static void 1336ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1337 UErrorCode *pErrorCode) { 1338 UConverter *cnv; 1339 const uint8_t *source, *sourceLimit; 1340 UChar *target; 1341 const UChar *targetLimit; 1342 int32_t *offsets; 1343 1344 const int32_t (*stateTable)[256]; 1345 1346 int32_t sourceIndex; 1347 1348 int32_t entry; 1349 UChar c; 1350 uint8_t action; 1351 1352 /* set up the local pointers */ 1353 cnv=pArgs->converter; 1354 source=(const uint8_t *)pArgs->source; 1355 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1356 target=pArgs->target; 1357 targetLimit=pArgs->targetLimit; 1358 offsets=pArgs->offsets; 1359 1360 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1361 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 1362 } else { 1363 stateTable=cnv->sharedData->mbcs.stateTable; 1364 } 1365 1366 /* sourceIndex=-1 if the current character began in the previous buffer */ 1367 sourceIndex=0; 1368 1369 /* conversion loop */ 1370 while(source<sourceLimit) { 1371 /* 1372 * This following test is to see if available input would overflow the output. 1373 * It does not catch output of more than one code unit that 1374 * overflows as a result of a surrogate pair or callback output 1375 * from the last source byte. 1376 * Therefore, those situations also test for overflows and will 1377 * then break the loop, too. 
1378 */ 1379 if(target>=targetLimit) { 1380 /* target is full */ 1381 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1382 break; 1383 } 1384 1385 entry=stateTable[0][*source++]; 1386 /* MBCS_ENTRY_IS_FINAL(entry) */ 1387 1388 /* test the most common case first */ 1389 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1390 /* output BMP code point */ 1391 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1392 if(offsets!=NULL) { 1393 *offsets++=sourceIndex; 1394 } 1395 1396 /* normal end of action codes: prepare for a new character */ 1397 ++sourceIndex; 1398 continue; 1399 } 1400 1401 /* 1402 * An if-else-if chain provides more reliable performance for 1403 * the most common cases compared to a switch. 1404 */ 1405 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1406 if(action==MBCS_STATE_VALID_DIRECT_20 || 1407 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 1408 ) { 1409 entry=MBCS_ENTRY_FINAL_VALUE(entry); 1410 /* output surrogate pair */ 1411 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 1412 if(offsets!=NULL) { 1413 *offsets++=sourceIndex; 1414 } 1415 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 1416 if(target<targetLimit) { 1417 *target++=c; 1418 if(offsets!=NULL) { 1419 *offsets++=sourceIndex; 1420 } 1421 } else { 1422 /* target overflow */ 1423 cnv->UCharErrorBuffer[0]=c; 1424 cnv->UCharErrorBufferLength=1; 1425 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1426 break; 1427 } 1428 1429 ++sourceIndex; 1430 continue; 1431 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1432 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 1433 /* output BMP code point */ 1434 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1435 if(offsets!=NULL) { 1436 *offsets++=sourceIndex; 1437 } 1438 1439 ++sourceIndex; 1440 continue; 1441 } 1442 } else if(action==MBCS_STATE_UNASSIGNED) { 1443 /* just fall through */ 1444 } else if(action==MBCS_STATE_ILLEGAL) { 1445 /* callback(illegal) */ 1446 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1447 } else { 1448 /* reserved, must never occur */ 1449 
++sourceIndex; 1450 continue; 1451 } 1452 1453 if(U_FAILURE(*pErrorCode)) { 1454 /* callback(illegal) */ 1455 break; 1456 } else /* unassigned sequences indicated with byteIndex>0 */ { 1457 /* try an extension mapping */ 1458 pArgs->source=(const char *)source; 1459 cnv->toUBytes[0]=*(source-1); 1460 cnv->toULength=_extToU(cnv, cnv->sharedData, 1461 1, &source, sourceLimit, 1462 &target, targetLimit, 1463 &offsets, sourceIndex, 1464 pArgs->flush, 1465 pErrorCode); 1466 sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source); 1467 1468 if(U_FAILURE(*pErrorCode)) { 1469 /* not mappable or buffer overflow */ 1470 break; 1471 } 1472 } 1473 } 1474 1475 /* write back the updated pointers */ 1476 pArgs->source=(const char *)source; 1477 pArgs->target=target; 1478 pArgs->offsets=offsets; 1479} 1480 1481/* 1482 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages 1483 * that only map to and from the BMP. 1484 * In addition to single-byte optimizations, the offset calculations 1485 * become much easier. 
1486 */ 1487static void 1488ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs, 1489 UErrorCode *pErrorCode) { 1490 UConverter *cnv; 1491 const uint8_t *source, *sourceLimit, *lastSource; 1492 UChar *target; 1493 int32_t targetCapacity, length; 1494 int32_t *offsets; 1495 1496 const int32_t (*stateTable)[256]; 1497 1498 int32_t sourceIndex; 1499 1500 int32_t entry; 1501 uint8_t action; 1502 1503 /* set up the local pointers */ 1504 cnv=pArgs->converter; 1505 source=(const uint8_t *)pArgs->source; 1506 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1507 target=pArgs->target; 1508 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1509 offsets=pArgs->offsets; 1510 1511 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1512 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 1513 } else { 1514 stateTable=cnv->sharedData->mbcs.stateTable; 1515 } 1516 1517 /* sourceIndex=-1 if the current character began in the previous buffer */ 1518 sourceIndex=0; 1519 lastSource=source; 1520 1521 /* 1522 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 1523 * for the minimum of the sourceLength and targetCapacity 1524 */ 1525 length=(int32_t)(sourceLimit-source); 1526 if(length<targetCapacity) { 1527 targetCapacity=length; 1528 } 1529 1530#if MBCS_UNROLL_SINGLE_TO_BMP 1531 /* unrolling makes it faster on Pentium III/Windows 2000 */ 1532 /* unroll the loop with the most common case */ 1533unrolled: 1534 if(targetCapacity>=16) { 1535 int32_t count, loops, oredEntries; 1536 1537 loops=count=targetCapacity>>4; 1538 do { 1539 oredEntries=entry=stateTable[0][*source++]; 1540 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1541 oredEntries|=entry=stateTable[0][*source++]; 1542 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1543 oredEntries|=entry=stateTable[0][*source++]; 1544 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1545 oredEntries|=entry=stateTable[0][*source++]; 1546 
*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1547 oredEntries|=entry=stateTable[0][*source++]; 1548 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1549 oredEntries|=entry=stateTable[0][*source++]; 1550 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1551 oredEntries|=entry=stateTable[0][*source++]; 1552 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1553 oredEntries|=entry=stateTable[0][*source++]; 1554 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1555 oredEntries|=entry=stateTable[0][*source++]; 1556 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1557 oredEntries|=entry=stateTable[0][*source++]; 1558 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1559 oredEntries|=entry=stateTable[0][*source++]; 1560 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1561 oredEntries|=entry=stateTable[0][*source++]; 1562 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1563 oredEntries|=entry=stateTable[0][*source++]; 1564 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1565 oredEntries|=entry=stateTable[0][*source++]; 1566 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1567 oredEntries|=entry=stateTable[0][*source++]; 1568 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1569 oredEntries|=entry=stateTable[0][*source++]; 1570 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1571 1572 /* were all 16 entries really valid? 
*/ 1573 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) { 1574 /* no, return to the first of these 16 */ 1575 source-=16; 1576 target-=16; 1577 break; 1578 } 1579 } while(--count>0); 1580 count=loops-count; 1581 targetCapacity-=16*count; 1582 1583 if(offsets!=NULL) { 1584 lastSource+=16*count; 1585 while(count>0) { 1586 *offsets++=sourceIndex++; 1587 *offsets++=sourceIndex++; 1588 *offsets++=sourceIndex++; 1589 *offsets++=sourceIndex++; 1590 *offsets++=sourceIndex++; 1591 *offsets++=sourceIndex++; 1592 *offsets++=sourceIndex++; 1593 *offsets++=sourceIndex++; 1594 *offsets++=sourceIndex++; 1595 *offsets++=sourceIndex++; 1596 *offsets++=sourceIndex++; 1597 *offsets++=sourceIndex++; 1598 *offsets++=sourceIndex++; 1599 *offsets++=sourceIndex++; 1600 *offsets++=sourceIndex++; 1601 *offsets++=sourceIndex++; 1602 --count; 1603 } 1604 } 1605 } 1606#endif 1607 1608 /* conversion loop */ 1609 while(targetCapacity>0) { 1610 entry=stateTable[0][*source++]; 1611 /* MBCS_ENTRY_IS_FINAL(entry) */ 1612 1613 /* test the most common case first */ 1614 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1615 /* output BMP code point */ 1616 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1617 --targetCapacity; 1618 continue; 1619 } 1620 1621 /* 1622 * An if-else-if chain provides more reliable performance for 1623 * the most common cases compared to a switch. 
1624 */ 1625 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1626 if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1627 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 1628 /* output BMP code point */ 1629 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1630 --targetCapacity; 1631 continue; 1632 } 1633 } else if(action==MBCS_STATE_UNASSIGNED) { 1634 /* just fall through */ 1635 } else if(action==MBCS_STATE_ILLEGAL) { 1636 /* callback(illegal) */ 1637 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1638 } else { 1639 /* reserved, must never occur */ 1640 continue; 1641 } 1642 1643 /* set offsets since the start or the last extension */ 1644 if(offsets!=NULL) { 1645 int32_t count=(int32_t)(source-lastSource); 1646 1647 /* predecrement: do not set the offset for the callback-causing character */ 1648 while(--count>0) { 1649 *offsets++=sourceIndex++; 1650 } 1651 /* offset and sourceIndex are now set for the current character */ 1652 } 1653 1654 if(U_FAILURE(*pErrorCode)) { 1655 /* callback(illegal) */ 1656 break; 1657 } else /* unassigned sequences indicated with byteIndex>0 */ { 1658 /* try an extension mapping */ 1659 lastSource=source; 1660 cnv->toUBytes[0]=*(source-1); 1661 cnv->toULength=_extToU(cnv, cnv->sharedData, 1662 1, &source, sourceLimit, 1663 &target, target+targetCapacity, 1664 &offsets, sourceIndex, 1665 pArgs->flush, 1666 pErrorCode); 1667 sourceIndex+=1+(int32_t)(source-lastSource); 1668 1669 if(U_FAILURE(*pErrorCode)) { 1670 /* not mappable or buffer overflow */ 1671 break; 1672 } 1673 1674 /* recalculate the targetCapacity after an extension mapping */ 1675 targetCapacity=(int32_t)(pArgs->targetLimit-target); 1676 length=(int32_t)(sourceLimit-source); 1677 if(length<targetCapacity) { 1678 targetCapacity=length; 1679 } 1680 } 1681 1682#if MBCS_UNROLL_SINGLE_TO_BMP 1683 /* unrolling makes it faster on Pentium III/Windows 2000 */ 1684 goto unrolled; 1685#endif 1686 } 1687 1688 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) { 1689 /* target is full 
*/ 1690 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1691 } 1692 1693 /* set offsets since the start or the last callback */ 1694 if(offsets!=NULL) { 1695 size_t count=source-lastSource; 1696 while(count>0) { 1697 *offsets++=sourceIndex++; 1698 --count; 1699 } 1700 } 1701 1702 /* write back the updated pointers */ 1703 pArgs->source=(const char *)source; 1704 pArgs->target=target; 1705 pArgs->offsets=offsets; 1706} 1707 1708U_CFUNC void 1709ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1710 UErrorCode *pErrorCode) { 1711 UConverter *cnv; 1712 const uint8_t *source, *sourceLimit; 1713 UChar *target; 1714 const UChar *targetLimit; 1715 int32_t *offsets; 1716 1717 const int32_t (*stateTable)[256]; 1718 const uint16_t *unicodeCodeUnits; 1719 1720 uint32_t offset; 1721 uint8_t state; 1722 int8_t byteIndex; 1723 uint8_t *bytes; 1724 1725 int32_t sourceIndex, nextSourceIndex; 1726 1727 int32_t entry; 1728 UChar c; 1729 uint8_t action; 1730 1731 /* use optimized function if possible */ 1732 cnv=pArgs->converter; 1733 1734 if(cnv->preToULength>0) { 1735 /* 1736 * pass sourceIndex=-1 because we continue from an earlier buffer 1737 * in the future, this may change with continuous offsets 1738 */ 1739 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode); 1740 1741 if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) { 1742 return; 1743 } 1744 } 1745 1746 if(cnv->sharedData->mbcs.countStates==1) { 1747 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1748 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode); 1749 } else { 1750 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode); 1751 } 1752 return; 1753 } 1754 1755 /* set up the local pointers */ 1756 source=(const uint8_t *)pArgs->source; 1757 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1758 target=pArgs->target; 1759 targetLimit=pArgs->targetLimit; 1760 offsets=pArgs->offsets; 1761 1762 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1763 stateTable=(const int32_t 
(*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 1764 } else { 1765 stateTable=cnv->sharedData->mbcs.stateTable; 1766 } 1767 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 1768 1769 /* get the converter state from UConverter */ 1770 offset=cnv->toUnicodeStatus; 1771 byteIndex=cnv->toULength; 1772 bytes=cnv->toUBytes; 1773 1774 /* 1775 * if we are in the SBCS state for a DBCS-only converter, 1776 * then load the DBCS state from the MBCS data 1777 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 1778 */ 1779 if((state=(uint8_t)(cnv->mode))==0) { 1780 state=cnv->sharedData->mbcs.dbcsOnlyState; 1781 } 1782 1783 /* sourceIndex=-1 if the current character began in the previous buffer */ 1784 sourceIndex=byteIndex==0 ? 0 : -1; 1785 nextSourceIndex=0; 1786 1787 /* conversion loop */ 1788 while(source<sourceLimit) { 1789 /* 1790 * This following test is to see if available input would overflow the output. 1791 * It does not catch output of more than one code unit that 1792 * overflows as a result of a surrogate pair or callback output 1793 * from the last source byte. 1794 * Therefore, those situations also test for overflows and will 1795 * then break the loop, too. 
1796 */ 1797 if(target>=targetLimit) { 1798 /* target is full */ 1799 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1800 break; 1801 } 1802 1803 if(byteIndex==0) { 1804 /* optimized loop for 1/2-byte input and BMP output */ 1805 if(offsets==NULL) { 1806 do { 1807 entry=stateTable[state][*source]; 1808 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1809 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1810 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1811 1812 ++source; 1813 if( source<sourceLimit && 1814 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 1815 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 1816 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 1817 ) { 1818 ++source; 1819 *target++=c; 1820 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1821 offset=0; 1822 } else { 1823 /* set the state and leave the optimized loop */ 1824 bytes[0]=*(source-1); 1825 byteIndex=1; 1826 break; 1827 } 1828 } else { 1829 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1830 /* output BMP code point */ 1831 ++source; 1832 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1833 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1834 } else { 1835 /* leave the optimized loop */ 1836 break; 1837 } 1838 } 1839 } while(source<sourceLimit && target<targetLimit); 1840 } else /* offsets!=NULL */ { 1841 do { 1842 entry=stateTable[state][*source]; 1843 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1844 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1845 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1846 1847 ++source; 1848 if( source<sourceLimit && 1849 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 1850 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 1851 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 1852 ) { 1853 ++source; 1854 *target++=c; 1855 if(offsets!=NULL) { 1856 *offsets++=sourceIndex; 1857 sourceIndex=(nextSourceIndex+=2); 1858 } 1859 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 
0 */ 1860 offset=0; 1861 } else { 1862 /* set the state and leave the optimized loop */ 1863 ++nextSourceIndex; 1864 bytes[0]=*(source-1); 1865 byteIndex=1; 1866 break; 1867 } 1868 } else { 1869 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1870 /* output BMP code point */ 1871 ++source; 1872 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1873 if(offsets!=NULL) { 1874 *offsets++=sourceIndex; 1875 sourceIndex=++nextSourceIndex; 1876 } 1877 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1878 } else { 1879 /* leave the optimized loop */ 1880 break; 1881 } 1882 } 1883 } while(source<sourceLimit && target<targetLimit); 1884 } 1885 1886 /* 1887 * these tests and break statements could be put inside the loop 1888 * if C had "break outerLoop" like Java 1889 */ 1890 if(source>=sourceLimit) { 1891 break; 1892 } 1893 if(target>=targetLimit) { 1894 /* target is full */ 1895 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1896 break; 1897 } 1898 1899 ++nextSourceIndex; 1900 bytes[byteIndex++]=*source++; 1901 } else /* byteIndex>0 */ { 1902 ++nextSourceIndex; 1903 entry=stateTable[state][bytes[byteIndex++]=*source++]; 1904 } 1905 1906 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1907 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1908 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1909 continue; 1910 } 1911 1912 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 1913 cnv->mode=state; 1914 1915 /* set the next state early so that we can reuse the entry variable */ 1916 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1917 1918 /* 1919 * An if-else-if chain provides more reliable performance for 1920 * the most common cases compared to a switch. 
1921 */ 1922 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1923 if(action==MBCS_STATE_VALID_16) { 1924 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 1925 c=unicodeCodeUnits[offset]; 1926 if(c<0xfffe) { 1927 /* output BMP code point */ 1928 *target++=c; 1929 if(offsets!=NULL) { 1930 *offsets++=sourceIndex; 1931 } 1932 byteIndex=0; 1933 } else if(c==0xfffe) { 1934 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 1935 /* output fallback BMP code point */ 1936 *target++=(UChar)entry; 1937 if(offsets!=NULL) { 1938 *offsets++=sourceIndex; 1939 } 1940 byteIndex=0; 1941 } 1942 } else { 1943 /* callback(illegal) */ 1944 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1945 } 1946 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 1947 /* output BMP code point */ 1948 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1949 if(offsets!=NULL) { 1950 *offsets++=sourceIndex; 1951 } 1952 byteIndex=0; 1953 } else if(action==MBCS_STATE_VALID_16_PAIR) { 1954 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 1955 c=unicodeCodeUnits[offset++]; 1956 if(c<0xd800) { 1957 /* output BMP code point below 0xd800 */ 1958 *target++=c; 1959 if(offsets!=NULL) { 1960 *offsets++=sourceIndex; 1961 } 1962 byteIndex=0; 1963 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 1964 /* output roundtrip or fallback surrogate pair */ 1965 *target++=(UChar)(c&0xdbff); 1966 if(offsets!=NULL) { 1967 *offsets++=sourceIndex; 1968 } 1969 byteIndex=0; 1970 if(target<targetLimit) { 1971 *target++=unicodeCodeUnits[offset]; 1972 if(offsets!=NULL) { 1973 *offsets++=sourceIndex; 1974 } 1975 } else { 1976 /* target overflow */ 1977 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset]; 1978 cnv->UCharErrorBufferLength=1; 1979 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1980 1981 offset=0; 1982 break; 1983 } 1984 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { 1985 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 1986 *target++=unicodeCodeUnits[offset]; 1987 if(offsets!=NULL) { 1988 *offsets++=sourceIndex; 1989 } 1990 byteIndex=0; 1991 } else if(c==0xffff) { 1992 /* callback(illegal) */ 1993 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1994 } 1995 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 1996 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 1997 ) { 1998 entry=MBCS_ENTRY_FINAL_VALUE(entry); 1999 /* output surrogate pair */ 2000 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 2001 if(offsets!=NULL) { 2002 *offsets++=sourceIndex; 2003 } 2004 byteIndex=0; 2005 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 2006 if(target<targetLimit) { 2007 *target++=c; 2008 if(offsets!=NULL) { 2009 *offsets++=sourceIndex; 2010 } 2011 } else { 2012 /* target overflow */ 2013 cnv->UCharErrorBuffer[0]=c; 2014 cnv->UCharErrorBufferLength=1; 2015 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2016 2017 offset=0; 2018 break; 2019 } 2020 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2021 /* 2022 * This serves as a state change without any output. 2023 * It is useful for reading simple stateful encodings, 2024 * for example using just Shift-In/Shift-Out codes. 2025 * The 21 unused bits may later be used for more sophisticated 2026 * state transitions. 
2027 */ 2028 if(cnv->sharedData->mbcs.dbcsOnlyState==0) { 2029 byteIndex=0; 2030 } else { 2031 /* SI/SO are illegal for DBCS-only conversion */ 2032 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2033 2034 /* callback(illegal) */ 2035 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2036 } 2037 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2038 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2039 /* output BMP code point */ 2040 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2041 if(offsets!=NULL) { 2042 *offsets++=sourceIndex; 2043 } 2044 byteIndex=0; 2045 } 2046 } else if(action==MBCS_STATE_UNASSIGNED) { 2047 /* just fall through */ 2048 } else if(action==MBCS_STATE_ILLEGAL) { 2049 /* callback(illegal) */ 2050 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2051 } else { 2052 /* reserved, must never occur */ 2053 byteIndex=0; 2054 } 2055 2056 /* end of action codes: prepare for a new character */ 2057 offset=0; 2058 2059 if(byteIndex==0) { 2060 sourceIndex=nextSourceIndex; 2061 } else if(U_FAILURE(*pErrorCode)) { 2062 /* callback(illegal) */ 2063 break; 2064 } else /* unassigned sequences indicated with byteIndex>0 */ { 2065 /* try an extension mapping */ 2066 pArgs->source=(const char *)source; 2067 byteIndex=_extToU(cnv, cnv->sharedData, 2068 byteIndex, &source, sourceLimit, 2069 &target, targetLimit, 2070 &offsets, sourceIndex, 2071 pArgs->flush, 2072 pErrorCode); 2073 sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source); 2074 2075 if(U_FAILURE(*pErrorCode)) { 2076 /* not mappable or buffer overflow */ 2077 break; 2078 } 2079 } 2080 } 2081 2082 /* set the converter state back into UConverter */ 2083 cnv->toUnicodeStatus=offset; 2084 cnv->mode=state; 2085 cnv->toULength=byteIndex; 2086 2087 /* write back the updated pointers */ 2088 pArgs->source=(const char *)source; 2089 pArgs->target=target; 2090 pArgs->offsets=offsets; 2091} 2092 2093/* 2094 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages. 
2095 * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 2096 */ 2097static UChar32 2098ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 2099 UErrorCode *pErrorCode) { 2100 UConverter *cnv; 2101 const int32_t (*stateTable)[256]; 2102 const uint8_t *source, *sourceLimit; 2103 2104 int32_t entry; 2105 uint8_t action; 2106 2107 /* set up the local pointers */ 2108 cnv=pArgs->converter; 2109 source=(const uint8_t *)pArgs->source; 2110 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2111 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2112 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2113 } else { 2114 stateTable=cnv->sharedData->mbcs.stateTable; 2115 } 2116 2117 /* conversion loop */ 2118 while(source<sourceLimit) { 2119 entry=stateTable[0][*source++]; 2120 /* MBCS_ENTRY_IS_FINAL(entry) */ 2121 2122 /* write back the updated pointer early so that we can return directly */ 2123 pArgs->source=(const char *)source; 2124 2125 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2126 /* output BMP code point */ 2127 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2128 } 2129 2130 /* 2131 * An if-else-if chain provides more reliable performance for 2132 * the most common cases compared to a switch. 
2133 */ 2134 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2135 if( action==MBCS_STATE_VALID_DIRECT_20 || 2136 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2137 ) { 2138 /* output supplementary code point */ 2139 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2140 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2141 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2142 /* output BMP code point */ 2143 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2144 } 2145 } else if(action==MBCS_STATE_UNASSIGNED) { 2146 /* just fall through */ 2147 } else if(action==MBCS_STATE_ILLEGAL) { 2148 /* callback(illegal) */ 2149 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2150 } else { 2151 /* reserved, must never occur */ 2152 continue; 2153 } 2154 2155 if(U_FAILURE(*pErrorCode)) { 2156 /* callback(illegal) */ 2157 break; 2158 } else /* unassigned sequence */ { 2159 /* defer to the generic implementation */ 2160 pArgs->source=(const char *)source-1; 2161 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2162 } 2163 } 2164 2165 /* no output because of empty input or only state changes */ 2166 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2167 return 0xffff; 2168} 2169 2170/* 2171 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 2172 * conversion without offset handling. 2173 * 2174 * When a character does not have a mapping to Unicode, then we return to the 2175 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 2176 * handling. 2177 * We also defer to the generic code in other complicated cases and have them 2178 * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 2179 * 2180 * All normal mappings and errors are handled here. 
2181 */ 2182static UChar32 2183ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 2184 UErrorCode *pErrorCode) { 2185 UConverter *cnv; 2186 const uint8_t *source, *sourceLimit, *lastSource; 2187 2188 const int32_t (*stateTable)[256]; 2189 const uint16_t *unicodeCodeUnits; 2190 2191 uint32_t offset; 2192 uint8_t state; 2193 2194 int32_t entry; 2195 UChar32 c; 2196 uint8_t action; 2197 2198 /* use optimized function if possible */ 2199 cnv=pArgs->converter; 2200 2201 if(cnv->preToULength>0) { 2202 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */ 2203 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2204 } 2205 2206 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { 2207 /* 2208 * Using the generic ucnv_getNextUChar() code lets us deal correctly 2209 * with the rare case of a codepage that maps single surrogates 2210 * without adding the complexity to this already complicated function here. 2211 */ 2212 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2213 } else if(cnv->sharedData->mbcs.countStates==1) { 2214 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode); 2215 } 2216 2217 /* set up the local pointers */ 2218 source=lastSource=(const uint8_t *)pArgs->source; 2219 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2220 2221 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2222 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2223 } else { 2224 stateTable=cnv->sharedData->mbcs.stateTable; 2225 } 2226 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2227 2228 /* get the converter state from UConverter */ 2229 offset=cnv->toUnicodeStatus; 2230 2231 /* 2232 * if we are in the SBCS state for a DBCS-only converter, 2233 * then load the DBCS state from the MBCS data 2234 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2235 */ 2236 if((state=(uint8_t)(cnv->mode))==0) { 2237 state=cnv->sharedData->mbcs.dbcsOnlyState; 2238 } 2239 2240 /* conversion loop */ 2241 c=U_SENTINEL; 2242 while(source<sourceLimit) { 
2243 entry=stateTable[state][*source++]; 2244 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2245 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2246 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2247 2248 /* optimization for 1/2-byte input and BMP output */ 2249 if( source<sourceLimit && 2250 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2251 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2252 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2253 ) { 2254 ++source; 2255 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2256 /* output BMP code point */ 2257 break; 2258 } 2259 } else { 2260 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2261 cnv->mode=state; 2262 2263 /* set the next state early so that we can reuse the entry variable */ 2264 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2265 2266 /* 2267 * An if-else-if chain provides more reliable performance for 2268 * the most common cases compared to a switch. 2269 */ 2270 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2271 if(action==MBCS_STATE_VALID_DIRECT_16) { 2272 /* output BMP code point */ 2273 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2274 break; 2275 } else if(action==MBCS_STATE_VALID_16) { 2276 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2277 c=unicodeCodeUnits[offset]; 2278 if(c<0xfffe) { 2279 /* output BMP code point */ 2280 break; 2281 } else if(c==0xfffe) { 2282 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2283 break; 2284 } 2285 } else { 2286 /* callback(illegal) */ 2287 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2288 } 2289 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2290 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2291 c=unicodeCodeUnits[offset++]; 2292 if(c<0xd800) { 2293 /* output BMP code point below 0xd800 */ 2294 break; 2295 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
c<=0xdfff : c<=0xdbff) { 2296 /* output roundtrip or fallback supplementary code point */ 2297 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 2298 break; 2299 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 2300 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2301 c=unicodeCodeUnits[offset]; 2302 break; 2303 } else if(c==0xffff) { 2304 /* callback(illegal) */ 2305 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2306 } 2307 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2308 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2309 ) { 2310 /* output supplementary code point */ 2311 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2312 break; 2313 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2314 /* 2315 * This serves as a state change without any output. 2316 * It is useful for reading simple stateful encodings, 2317 * for example using just Shift-In/Shift-Out codes. 2318 * The 21 unused bits may later be used for more sophisticated 2319 * state transitions. 
2320 */ 2321 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) { 2322 /* SI/SO are illegal for DBCS-only conversion */ 2323 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2324 2325 /* callback(illegal) */ 2326 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2327 } 2328 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2329 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2330 /* output BMP code point */ 2331 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2332 break; 2333 } 2334 } else if(action==MBCS_STATE_UNASSIGNED) { 2335 /* just fall through */ 2336 } else if(action==MBCS_STATE_ILLEGAL) { 2337 /* callback(illegal) */ 2338 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2339 } else { 2340 /* reserved (must never occur), or only state change */ 2341 offset=0; 2342 lastSource=source; 2343 continue; 2344 } 2345 2346 /* end of action codes: prepare for a new character */ 2347 offset=0; 2348 2349 if(U_FAILURE(*pErrorCode)) { 2350 /* callback(illegal) */ 2351 break; 2352 } else /* unassigned sequence */ { 2353 /* defer to the generic implementation */ 2354 cnv->toUnicodeStatus=0; 2355 cnv->mode=state; 2356 pArgs->source=(const char *)lastSource; 2357 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2358 } 2359 } 2360 } 2361 2362 if(c<0) { 2363 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { 2364 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 2365 } 2366 if(U_FAILURE(*pErrorCode)) { 2367 /* incomplete character byte sequence */ 2368 uint8_t *bytes=cnv->toUBytes; 2369 cnv->toULength=(int8_t)(source-lastSource); 2370 do { 2371 *bytes++=*lastSource++; 2372 } while(lastSource<source); 2373 } else { 2374 /* no output because of empty input or only state changes */ 2375 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2376 } 2377 c=0xffff; 2378 } 2379 2380 /* set the converter state back into UConverter, ready for a new character */ 2381 cnv->toUnicodeStatus=0; 2382 cnv->mode=state; 2383 2384 /* write back the updated pointer */ 2385 pArgs->source=(const char *)source; 2386 return c; 2387} 2388 2389#if 0 2390/* 2391 * 
Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 2392 * Removal improves code coverage. 2393 */ 2394/** 2395 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. 2396 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 2397 * It does not handle conversion extensions (_extToU()). 2398 */ 2399U_CFUNC UChar32 2400ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, 2401 uint8_t b, UBool useFallback) { 2402 int32_t entry; 2403 uint8_t action; 2404 2405 entry=sharedData->mbcs.stateTable[0][b]; 2406 /* MBCS_ENTRY_IS_FINAL(entry) */ 2407 2408 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2409 /* output BMP code point */ 2410 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2411 } 2412 2413 /* 2414 * An if-else-if chain provides more reliable performance for 2415 * the most common cases compared to a switch. 2416 */ 2417 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2418 if(action==MBCS_STATE_VALID_DIRECT_20) { 2419 /* output supplementary code point */ 2420 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 2421 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2422 if(!TO_U_USE_FALLBACK(useFallback)) { 2423 return 0xfffe; 2424 } 2425 /* output BMP code point */ 2426 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2427 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 2428 if(!TO_U_USE_FALLBACK(useFallback)) { 2429 return 0xfffe; 2430 } 2431 /* output supplementary code point */ 2432 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 2433 } else if(action==MBCS_STATE_UNASSIGNED) { 2434 return 0xfffe; 2435 } else if(action==MBCS_STATE_ILLEGAL) { 2436 return 0xffff; 2437 } else { 2438 /* reserved, must never occur */ 2439 return 0xffff; 2440 } 2441} 2442#endif 2443 2444/* 2445 * This is a simple version of _MBCSGetNextUChar() that is used 2446 * by other converter implementations. 2447 * It only returns an "assigned" result if it consumes the entire input. 
2448 * It does not use state from the converter, nor error codes. 2449 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 2450 * It handles conversion extensions but not GB 18030. 2451 * 2452 * Return value: 2453 * U+fffe unassigned 2454 * U+ffff illegal 2455 * otherwise the Unicode code point 2456 */ 2457U_CFUNC UChar32 2458ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 2459 const char *source, int32_t length, 2460 UBool useFallback) { 2461 const int32_t (*stateTable)[256]; 2462 const uint16_t *unicodeCodeUnits; 2463 2464 uint32_t offset; 2465 uint8_t state, action; 2466 2467 UChar32 c; 2468 int32_t i, entry; 2469 2470 if(length<=0) { 2471 /* no input at all: "illegal" */ 2472 return 0xffff; 2473 } 2474 2475#if 0 2476/* 2477 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 2478 * TODO In future releases, verify that this function is never called for SBCS 2479 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 2480 * Removal improves code coverage. 
2481 */ 2482 /* use optimized function if possible */ 2483 if(sharedData->mbcs.countStates==1) { 2484 if(length==1) { 2485 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 2486 } else { 2487 return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 2488 } 2489 } 2490#endif 2491 2492 /* set up the local pointers */ 2493 stateTable=sharedData->mbcs.stateTable; 2494 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 2495 2496 /* converter state */ 2497 offset=0; 2498 state=sharedData->mbcs.dbcsOnlyState; 2499 2500 /* conversion loop */ 2501 for(i=0;;) { 2502 entry=stateTable[state][(uint8_t)source[i++]]; 2503 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2504 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2505 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2506 2507 if(i==length) { 2508 return 0xffff; /* truncated character */ 2509 } 2510 } else { 2511 /* 2512 * An if-else-if chain provides more reliable performance for 2513 * the most common cases compared to a switch. 2514 */ 2515 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2516 if(action==MBCS_STATE_VALID_16) { 2517 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2518 c=unicodeCodeUnits[offset]; 2519 if(c!=0xfffe) { 2520 /* done */ 2521 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2522 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 2523 /* else done with 0xfffe */ 2524 } 2525 break; 2526 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 2527 /* output BMP code point */ 2528 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2529 break; 2530 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2531 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2532 c=unicodeCodeUnits[offset++]; 2533 if(c<0xd800) { 2534 /* output BMP code point below 0xd800 */ 2535 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
c<=0xdfff : c<=0xdbff) { 2536 /* output roundtrip or fallback supplementary code point */ 2537 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 2538 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 2539 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2540 c=unicodeCodeUnits[offset]; 2541 } else if(c==0xffff) { 2542 return 0xffff; 2543 } else { 2544 c=0xfffe; 2545 } 2546 break; 2547 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 2548 /* output supplementary code point */ 2549 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 2550 break; 2551 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2552 if(!TO_U_USE_FALLBACK(useFallback)) { 2553 c=0xfffe; 2554 break; 2555 } 2556 /* output BMP code point */ 2557 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2558 break; 2559 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 2560 if(!TO_U_USE_FALLBACK(useFallback)) { 2561 c=0xfffe; 2562 break; 2563 } 2564 /* output supplementary code point */ 2565 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 2566 break; 2567 } else if(action==MBCS_STATE_UNASSIGNED) { 2568 c=0xfffe; 2569 break; 2570 } 2571 2572 /* 2573 * forbid MBCS_STATE_CHANGE_ONLY for this function, 2574 * and MBCS_STATE_ILLEGAL and reserved action codes 2575 */ 2576 return 0xffff; 2577 } 2578 } 2579 2580 if(i!=length) { 2581 /* illegal for this function: not all input consumed */ 2582 return 0xffff; 2583 } 2584 2585 if(c==0xfffe) { 2586 /* try an extension mapping */ 2587 const int32_t *cx=sharedData->mbcs.extIndexes; 2588 if(cx!=NULL) { 2589 return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 2590 } 2591 } 2592 2593 return c; 2594} 2595 2596/* MBCS-from-Unicode conversion functions ----------------------------------- */ 2597 2598/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. 
*/ 2599static void 2600ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 2601 UErrorCode *pErrorCode) { 2602 UConverter *cnv; 2603 const UChar *source, *sourceLimit; 2604 uint8_t *target; 2605 int32_t targetCapacity; 2606 int32_t *offsets; 2607 2608 const uint16_t *table; 2609 const uint16_t *mbcsIndex; 2610 const uint8_t *bytes; 2611 2612 UChar32 c; 2613 2614 int32_t sourceIndex, nextSourceIndex; 2615 2616 uint32_t stage2Entry; 2617 uint32_t asciiRoundtrips; 2618 uint32_t value; 2619 uint8_t unicodeMask; 2620 2621 /* use optimized function if possible */ 2622 cnv=pArgs->converter; 2623 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 2624 2625 /* set up the local pointers */ 2626 source=pArgs->source; 2627 sourceLimit=pArgs->sourceLimit; 2628 target=(uint8_t *)pArgs->target; 2629 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 2630 offsets=pArgs->offsets; 2631 2632 table=cnv->sharedData->mbcs.fromUnicodeTable; 2633 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 2634 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2635 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 2636 } else { 2637 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 2638 } 2639 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 2640 2641 /* get the converter state from UConverter */ 2642 c=cnv->fromUChar32; 2643 2644 /* sourceIndex=-1 if the current character began in the previous buffer */ 2645 sourceIndex= c==0 ? 0 : -1; 2646 nextSourceIndex=0; 2647 2648 /* conversion loop */ 2649 if(c!=0 && targetCapacity>0) { 2650 goto getTrail; 2651 } 2652 2653 while(source<sourceLimit) { 2654 /* 2655 * This following test is to see if available input would overflow the output. 2656 * It does not catch output of more than one byte that 2657 * overflows as a result of a multi-byte character or callback output 2658 * from the last source character. 2659 * Therefore, those situations also test for overflows and will 2660 * then break the loop, too. 
2661 */ 2662 if(targetCapacity>0) { 2663 /* 2664 * Get a correct Unicode code point: 2665 * a single UChar for a BMP code point or 2666 * a matched surrogate pair for a "supplementary code point". 2667 */ 2668 c=*source++; 2669 ++nextSourceIndex; 2670 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 2671 *target++=(uint8_t)c; 2672 if(offsets!=NULL) { 2673 *offsets++=sourceIndex; 2674 sourceIndex=nextSourceIndex; 2675 } 2676 --targetCapacity; 2677 c=0; 2678 continue; 2679 } 2680 /* 2681 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 2682 * to avoid dealing with surrogates. 2683 * MBCS_FAST_MAX must be >=0xd7ff. 2684 */ 2685 if(c<=0xd7ff) { 2686 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 2687 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 2688 if(value==0) { 2689 goto unassigned; 2690 } 2691 /* output the value */ 2692 } else { 2693 /* 2694 * This also tests if the codepage maps single surrogates. 2695 * If it does, then surrogates are not paired but mapped separately. 2696 * Note that in this case unmatched surrogates are not detected. 
2697 */ 2698 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 2699 if(UTF_IS_SURROGATE_FIRST(c)) { 2700getTrail: 2701 if(source<sourceLimit) { 2702 /* test the following code unit */ 2703 UChar trail=*source; 2704 if(UTF_IS_SECOND_SURROGATE(trail)) { 2705 ++source; 2706 ++nextSourceIndex; 2707 c=UTF16_GET_PAIR_VALUE(c, trail); 2708 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 2709 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 2710 /* callback(unassigned) */ 2711 goto unassigned; 2712 } 2713 /* convert this supplementary code point */ 2714 /* exit this condition tree */ 2715 } else { 2716 /* this is an unmatched lead code unit (1st surrogate) */ 2717 /* callback(illegal) */ 2718 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2719 break; 2720 } 2721 } else { 2722 /* no more input */ 2723 break; 2724 } 2725 } else { 2726 /* this is an unmatched trail code unit (2nd surrogate) */ 2727 /* callback(illegal) */ 2728 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2729 break; 2730 } 2731 } 2732 2733 /* convert the Unicode code point in c into codepage bytes */ 2734 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 2735 2736 /* get the bytes and the length for the output */ 2737 /* MBCS_OUTPUT_2 */ 2738 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 2739 2740 /* is this code point assigned, or do we use fallbacks? */ 2741 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 2742 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 2743 ) { 2744 /* 2745 * We allow a 0 byte output if the "assigned" bit is set for this entry. 2746 * There is no way with this data structure for fallback output 2747 * to be a zero byte. 
2748 */ 2749 2750unassigned: 2751 /* try an extension mapping */ 2752 pArgs->source=source; 2753 c=_extFromU(cnv, cnv->sharedData, 2754 c, &source, sourceLimit, 2755 &target, target+targetCapacity, 2756 &offsets, sourceIndex, 2757 pArgs->flush, 2758 pErrorCode); 2759 nextSourceIndex+=(int32_t)(source-pArgs->source); 2760 2761 if(U_FAILURE(*pErrorCode)) { 2762 /* not mappable or buffer overflow */ 2763 break; 2764 } else { 2765 /* a mapping was written to the target, continue */ 2766 2767 /* recalculate the targetCapacity after an extension mapping */ 2768 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 2769 2770 /* normal end of conversion: prepare for a new character */ 2771 sourceIndex=nextSourceIndex; 2772 continue; 2773 } 2774 } 2775 } 2776 2777 /* write the output character bytes from value and length */ 2778 /* from the first if in the loop we know that targetCapacity>0 */ 2779 if(value<=0xff) { 2780 /* this is easy because we know that there is enough space */ 2781 *target++=(uint8_t)value; 2782 if(offsets!=NULL) { 2783 *offsets++=sourceIndex; 2784 } 2785 --targetCapacity; 2786 } else /* length==2 */ { 2787 *target++=(uint8_t)(value>>8); 2788 if(2<=targetCapacity) { 2789 *target++=(uint8_t)value; 2790 if(offsets!=NULL) { 2791 *offsets++=sourceIndex; 2792 *offsets++=sourceIndex; 2793 } 2794 targetCapacity-=2; 2795 } else { 2796 if(offsets!=NULL) { 2797 *offsets++=sourceIndex; 2798 } 2799 cnv->charErrorBuffer[0]=(char)value; 2800 cnv->charErrorBufferLength=1; 2801 2802 /* target overflow */ 2803 targetCapacity=0; 2804 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2805 c=0; 2806 break; 2807 } 2808 } 2809 2810 /* normal end of conversion: prepare for a new character */ 2811 c=0; 2812 sourceIndex=nextSourceIndex; 2813 continue; 2814 } else { 2815 /* target is full */ 2816 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2817 break; 2818 } 2819 } 2820 2821 /* set the converter state back into UConverter */ 2822 cnv->fromUChar32=c; 2823 2824 /* write back the updated 
pointers */ 2825 pArgs->source=source; 2826 pArgs->target=(char *)target; 2827 pArgs->offsets=offsets; 2828} 2829 2830/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ 2831static void 2832ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 2833 UErrorCode *pErrorCode) { 2834 UConverter *cnv; 2835 const UChar *source, *sourceLimit; 2836 uint8_t *target; 2837 int32_t targetCapacity; 2838 int32_t *offsets; 2839 2840 const uint16_t *table; 2841 const uint16_t *results; 2842 2843 UChar32 c; 2844 2845 int32_t sourceIndex, nextSourceIndex; 2846 2847 uint16_t value, minValue; 2848 UBool hasSupplementary; 2849 2850 /* set up the local pointers */ 2851 cnv=pArgs->converter; 2852 source=pArgs->source; 2853 sourceLimit=pArgs->sourceLimit; 2854 target=(uint8_t *)pArgs->target; 2855 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 2856 offsets=pArgs->offsets; 2857 2858 table=cnv->sharedData->mbcs.fromUnicodeTable; 2859 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2860 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 2861 } else { 2862 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 2863 } 2864 2865 if(cnv->useFallback) { 2866 /* use all roundtrip and fallback results */ 2867 minValue=0x800; 2868 } else { 2869 /* use only roundtrips and fallbacks from private-use characters */ 2870 minValue=0xc00; 2871 } 2872 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 2873 2874 /* get the converter state from UConverter */ 2875 c=cnv->fromUChar32; 2876 2877 /* sourceIndex=-1 if the current character began in the previous buffer */ 2878 sourceIndex= c==0 ? 0 : -1; 2879 nextSourceIndex=0; 2880 2881 /* conversion loop */ 2882 if(c!=0 && targetCapacity>0) { 2883 goto getTrail; 2884 } 2885 2886 while(source<sourceLimit) { 2887 /* 2888 * This following test is to see if available input would overflow the output. 
2889 * It does not catch output of more than one byte that 2890 * overflows as a result of a multi-byte character or callback output 2891 * from the last source character. 2892 * Therefore, those situations also test for overflows and will 2893 * then break the loop, too. 2894 */ 2895 if(targetCapacity>0) { 2896 /* 2897 * Get a correct Unicode code point: 2898 * a single UChar for a BMP code point or 2899 * a matched surrogate pair for a "supplementary code point". 2900 */ 2901 c=*source++; 2902 ++nextSourceIndex; 2903 if(UTF_IS_SURROGATE(c)) { 2904 if(UTF_IS_SURROGATE_FIRST(c)) { 2905getTrail: 2906 if(source<sourceLimit) { 2907 /* test the following code unit */ 2908 UChar trail=*source; 2909 if(UTF_IS_SECOND_SURROGATE(trail)) { 2910 ++source; 2911 ++nextSourceIndex; 2912 c=UTF16_GET_PAIR_VALUE(c, trail); 2913 if(!hasSupplementary) { 2914 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 2915 /* callback(unassigned) */ 2916 goto unassigned; 2917 } 2918 /* convert this supplementary code point */ 2919 /* exit this condition tree */ 2920 } else { 2921 /* this is an unmatched lead code unit (1st surrogate) */ 2922 /* callback(illegal) */ 2923 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2924 break; 2925 } 2926 } else { 2927 /* no more input */ 2928 break; 2929 } 2930 } else { 2931 /* this is an unmatched trail code unit (2nd surrogate) */ 2932 /* callback(illegal) */ 2933 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2934 break; 2935 } 2936 } 2937 2938 /* convert the Unicode code point in c into codepage bytes */ 2939 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 2940 2941 /* is this code point assigned, or do we use fallbacks? 
*/ 2942 if(value>=minValue) { 2943 /* assigned, write the output character bytes from value and length */ 2944 /* length==1 */ 2945 /* this is easy because we know that there is enough space */ 2946 *target++=(uint8_t)value; 2947 if(offsets!=NULL) { 2948 *offsets++=sourceIndex; 2949 } 2950 --targetCapacity; 2951 2952 /* normal end of conversion: prepare for a new character */ 2953 c=0; 2954 sourceIndex=nextSourceIndex; 2955 } else { /* unassigned */ 2956unassigned: 2957 /* try an extension mapping */ 2958 pArgs->source=source; 2959 c=_extFromU(cnv, cnv->sharedData, 2960 c, &source, sourceLimit, 2961 &target, target+targetCapacity, 2962 &offsets, sourceIndex, 2963 pArgs->flush, 2964 pErrorCode); 2965 nextSourceIndex+=(int32_t)(source-pArgs->source); 2966 2967 if(U_FAILURE(*pErrorCode)) { 2968 /* not mappable or buffer overflow */ 2969 break; 2970 } else { 2971 /* a mapping was written to the target, continue */ 2972 2973 /* recalculate the targetCapacity after an extension mapping */ 2974 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 2975 2976 /* normal end of conversion: prepare for a new character */ 2977 sourceIndex=nextSourceIndex; 2978 } 2979 } 2980 } else { 2981 /* target is full */ 2982 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2983 break; 2984 } 2985 } 2986 2987 /* set the converter state back into UConverter */ 2988 cnv->fromUChar32=c; 2989 2990 /* write back the updated pointers */ 2991 pArgs->source=source; 2992 pArgs->target=(char *)target; 2993 pArgs->offsets=offsets; 2994} 2995 2996/* 2997 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 2998 * that map only to and from the BMP. 2999 * In addition to single-byte/state optimizations, the offset calculations 3000 * become much easier. 3001 * It would be possible to use the sbcsIndex for UTF-8-friendly tables, 3002 * but measurements have shown that this diminishes performance 3003 * in more cases than it improves it. 
3004 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches 3005 * for various MBCS and SBCS optimizations. 3006 */ 3007static void 3008ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, 3009 UErrorCode *pErrorCode) { 3010 UConverter *cnv; 3011 const UChar *source, *sourceLimit, *lastSource; 3012 uint8_t *target; 3013 int32_t targetCapacity, length; 3014 int32_t *offsets; 3015 3016 const uint16_t *table; 3017 const uint16_t *results; 3018 3019 UChar32 c; 3020 3021 int32_t sourceIndex; 3022 3023 uint32_t asciiRoundtrips; 3024 uint16_t value, minValue; 3025 3026 /* set up the local pointers */ 3027 cnv=pArgs->converter; 3028 source=pArgs->source; 3029 sourceLimit=pArgs->sourceLimit; 3030 target=(uint8_t *)pArgs->target; 3031 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3032 offsets=pArgs->offsets; 3033 3034 table=cnv->sharedData->mbcs.fromUnicodeTable; 3035 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3036 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3037 } else { 3038 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3039 } 3040 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3041 3042 if(cnv->useFallback) { 3043 /* use all roundtrip and fallback results */ 3044 minValue=0x800; 3045 } else { 3046 /* use only roundtrips and fallbacks from private-use characters */ 3047 minValue=0xc00; 3048 } 3049 3050 /* get the converter state from UConverter */ 3051 c=cnv->fromUChar32; 3052 3053 /* sourceIndex=-1 if the current character began in the previous buffer */ 3054 sourceIndex= c==0 ? 
0 : -1; 3055 lastSource=source; 3056 3057 /* 3058 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 3059 * for the minimum of the sourceLength and targetCapacity 3060 */ 3061 length=(int32_t)(sourceLimit-source); 3062 if(length<targetCapacity) { 3063 targetCapacity=length; 3064 } 3065 3066 /* conversion loop */ 3067 if(c!=0 && targetCapacity>0) { 3068 goto getTrail; 3069 } 3070 3071#if MBCS_UNROLL_SINGLE_FROM_BMP 3072 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3073 /* unroll the loop with the most common case */ 3074unrolled: 3075 if(targetCapacity>=4) { 3076 int32_t count, loops; 3077 uint16_t andedValues; 3078 3079 loops=count=targetCapacity>>2; 3080 do { 3081 c=*source++; 3082 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3083 *target++=(uint8_t)value; 3084 c=*source++; 3085 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3086 *target++=(uint8_t)value; 3087 c=*source++; 3088 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3089 *target++=(uint8_t)value; 3090 c=*source++; 3091 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3092 *target++=(uint8_t)value; 3093 3094 /* were all 4 entries really valid? */ 3095 if(andedValues<minValue) { 3096 /* no, return to the first of these 4 */ 3097 source-=4; 3098 target-=4; 3099 break; 3100 } 3101 } while(--count>0); 3102 count=loops-count; 3103 targetCapacity-=4*count; 3104 3105 if(offsets!=NULL) { 3106 lastSource+=4*count; 3107 while(count>0) { 3108 *offsets++=sourceIndex++; 3109 *offsets++=sourceIndex++; 3110 *offsets++=sourceIndex++; 3111 *offsets++=sourceIndex++; 3112 --count; 3113 } 3114 } 3115 3116 c=0; 3117 } 3118#endif 3119 3120 while(targetCapacity>0) { 3121 /* 3122 * Get a correct Unicode code point: 3123 * a single UChar for a BMP code point or 3124 * a matched surrogate pair for a "supplementary code point". 
3125 */ 3126 c=*source++; 3127 /* 3128 * Do not immediately check for single surrogates: 3129 * Assume that they are unassigned and check for them in that case. 3130 * This speeds up the conversion of assigned characters. 3131 */ 3132 /* convert the Unicode code point in c into codepage bytes */ 3133 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3134 *target++=(uint8_t)c; 3135 --targetCapacity; 3136 c=0; 3137 continue; 3138 } 3139 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3140 /* is this code point assigned, or do we use fallbacks? */ 3141 if(value>=minValue) { 3142 /* assigned, write the output character bytes from value and length */ 3143 /* length==1 */ 3144 /* this is easy because we know that there is enough space */ 3145 *target++=(uint8_t)value; 3146 --targetCapacity; 3147 3148 /* normal end of conversion: prepare for a new character */ 3149 c=0; 3150 continue; 3151 } else if(!UTF_IS_SURROGATE(c)) { 3152 /* normal, unassigned BMP character */ 3153 } else if(UTF_IS_SURROGATE_FIRST(c)) { 3154getTrail: 3155 if(source<sourceLimit) { 3156 /* test the following code unit */ 3157 UChar trail=*source; 3158 if(UTF_IS_SECOND_SURROGATE(trail)) { 3159 ++source; 3160 c=UTF16_GET_PAIR_VALUE(c, trail); 3161 /* this codepage does not map supplementary code points */ 3162 /* callback(unassigned) */ 3163 } else { 3164 /* this is an unmatched lead code unit (1st surrogate) */ 3165 /* callback(illegal) */ 3166 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3167 break; 3168 } 3169 } else { 3170 /* no more input */ 3171 if (pArgs->flush) { 3172 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3173 } 3174 break; 3175 } 3176 } else { 3177 /* this is an unmatched trail code unit (2nd surrogate) */ 3178 /* callback(illegal) */ 3179 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3180 break; 3181 } 3182 3183 /* c does not have a mapping */ 3184 3185 /* get the number of code units for c to correctly advance sourceIndex */ 3186 length=U16_LENGTH(c); 3187 3188 /* set offsets since the start or the last 
extension */ 3189 if(offsets!=NULL) { 3190 int32_t count=(int32_t)(source-lastSource); 3191 3192 /* do not set the offset for this character */ 3193 count-=length; 3194 3195 while(count>0) { 3196 *offsets++=sourceIndex++; 3197 --count; 3198 } 3199 /* offsets and sourceIndex are now set for the current character */ 3200 } 3201 3202 /* try an extension mapping */ 3203 lastSource=source; 3204 c=_extFromU(cnv, cnv->sharedData, 3205 c, &source, sourceLimit, 3206 &target, target+targetCapacity, 3207 &offsets, sourceIndex, 3208 pArgs->flush, 3209 pErrorCode); 3210 sourceIndex+=length+(int32_t)(source-lastSource); 3211 lastSource=source; 3212 3213 if(U_FAILURE(*pErrorCode)) { 3214 /* not mappable or buffer overflow */ 3215 break; 3216 } else { 3217 /* a mapping was written to the target, continue */ 3218 3219 /* recalculate the targetCapacity after an extension mapping */ 3220 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3221 length=(int32_t)(sourceLimit-source); 3222 if(length<targetCapacity) { 3223 targetCapacity=length; 3224 } 3225 } 3226 3227#if MBCS_UNROLL_SINGLE_FROM_BMP 3228 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3229 goto unrolled; 3230#endif 3231 } 3232 3233 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { 3234 /* target is full */ 3235 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3236 } 3237 3238 /* set offsets since the start or the last callback */ 3239 if(offsets!=NULL) { 3240 size_t count=source-lastSource; 3241 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) { 3242 /* 3243 Caller gave us a partial supplementary character, 3244 which this function couldn't convert in any case. 3245 The callback will handle the offset. 
3246 */ 3247 count--; 3248 } 3249 while(count>0) { 3250 *offsets++=sourceIndex++; 3251 --count; 3252 } 3253 } 3254 3255 /* set the converter state back into UConverter */ 3256 cnv->fromUChar32=c; 3257 3258 /* write back the updated pointers */ 3259 pArgs->source=source; 3260 pArgs->target=(char *)target; 3261 pArgs->offsets=offsets; 3262} 3263 3264U_CFUNC void 3265ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3266 UErrorCode *pErrorCode) { 3267 UConverter *cnv; 3268 const UChar *source, *sourceLimit; 3269 uint8_t *target; 3270 int32_t targetCapacity; 3271 int32_t *offsets; 3272 3273 const uint16_t *table; 3274 const uint16_t *mbcsIndex; 3275 const uint8_t *p, *bytes; 3276 uint8_t outputType; 3277 3278 UChar32 c; 3279 3280 int32_t prevSourceIndex, sourceIndex, nextSourceIndex; 3281 3282 uint32_t stage2Entry; 3283 uint32_t asciiRoundtrips; 3284 uint32_t value; 3285 int32_t length, prevLength; 3286 uint8_t unicodeMask; 3287 3288 cnv=pArgs->converter; 3289 3290 if(cnv->preFromUFirstCP>=0) { 3291 /* 3292 * pass sourceIndex=-1 because we continue from an earlier buffer 3293 * in the future, this may change with continuous offsets 3294 */ 3295 ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode); 3296 3297 if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) { 3298 return; 3299 } 3300 } 3301 3302 /* use optimized function if possible */ 3303 outputType=cnv->sharedData->mbcs.outputType; 3304 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3305 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3306 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3307 ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode); 3308 } else { 3309 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode); 3310 } 3311 return; 3312 } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) { 3313 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode); 3314 return; 3315 } 3316 3317 /* set up the local pointers */ 3318 source=pArgs->source; 3319 
sourceLimit=pArgs->sourceLimit; 3320 target=(uint8_t *)pArgs->target; 3321 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3322 offsets=pArgs->offsets; 3323 3324 table=cnv->sharedData->mbcs.fromUnicodeTable; 3325 if(cnv->sharedData->mbcs.utf8Friendly) { 3326 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3327 } else { 3328 mbcsIndex=NULL; 3329 } 3330 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3331 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3332 } else { 3333 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3334 } 3335 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3336 3337 /* get the converter state from UConverter */ 3338 c=cnv->fromUChar32; 3339 3340 if(outputType==MBCS_OUTPUT_2_SISO) { 3341 prevLength=cnv->fromUnicodeStatus; 3342 if(prevLength==0) { 3343 /* set the real value */ 3344 prevLength=1; 3345 } 3346 } else { 3347 /* prevent fromUnicodeStatus from being set to something non-0 */ 3348 prevLength=0; 3349 } 3350 3351 /* sourceIndex=-1 if the current character began in the previous buffer */ 3352 prevSourceIndex=-1; 3353 sourceIndex= c==0 ? 0 : -1; 3354 nextSourceIndex=0; 3355 3356 /* conversion loop */ 3357 /* 3358 * This is another piece of ugly code: 3359 * A goto into the loop if the converter state contains a first surrogate 3360 * from the previous function call. 3361 * It saves me to check in each loop iteration a check of if(c==0) 3362 * and duplicating the trail-surrogate-handling code in the else 3363 * branch of that check. 3364 * I could not find any other way to get around this other than 3365 * using a function call for the conversion and callback, which would 3366 * be even more inefficient. 3367 * 3368 * Markus Scherer 2000-jul-19 3369 */ 3370 if(c!=0 && targetCapacity>0) { 3371 goto getTrail; 3372 } 3373 3374 while(source<sourceLimit) { 3375 /* 3376 * This following test is to see if available input would overflow the output. 
3377 * It does not catch output of more than one byte that 3378 * overflows as a result of a multi-byte character or callback output 3379 * from the last source character. 3380 * Therefore, those situations also test for overflows and will 3381 * then break the loop, too. 3382 */ 3383 if(targetCapacity>0) { 3384 /* 3385 * Get a correct Unicode code point: 3386 * a single UChar for a BMP code point or 3387 * a matched surrogate pair for a "supplementary code point". 3388 */ 3389 c=*source++; 3390 ++nextSourceIndex; 3391 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3392 *target++=(uint8_t)c; 3393 if(offsets!=NULL) { 3394 *offsets++=sourceIndex; 3395 prevSourceIndex=sourceIndex; 3396 sourceIndex=nextSourceIndex; 3397 } 3398 --targetCapacity; 3399 c=0; 3400 continue; 3401 } 3402 /* 3403 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 3404 * to avoid dealing with surrogates. 3405 * MBCS_FAST_MAX must be >=0xd7ff. 3406 */ 3407 if(c<=0xd7ff && mbcsIndex!=NULL) { 3408 value=mbcsIndex[c>>6]; 3409 3410 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */ 3411 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 3412 switch(outputType) { 3413 case MBCS_OUTPUT_2: 3414 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 3415 if(value<=0xff) { 3416 if(value==0) { 3417 goto unassigned; 3418 } else { 3419 length=1; 3420 } 3421 } else { 3422 length=2; 3423 } 3424 break; 3425 case MBCS_OUTPUT_2_SISO: 3426 /* 1/2-byte stateful with Shift-In/Shift-Out */ 3427 /* 3428 * Save the old state in the converter object 3429 * right here, then change the local prevLength state variable if necessary. 3430 * Then, if this character turns out to be unassigned or a fallback that 3431 * is not taken, the callback code must not save the new state in the converter 3432 * because the new state is for a character that is not output. 
3433 * However, the callback must still restore the state from the converter 3434 * in case the callback function changed it for its output. 3435 */ 3436 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 3437 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 3438 if(value<=0xff) { 3439 if(value==0) { 3440 goto unassigned; 3441 } else if(prevLength<=1) { 3442 length=1; 3443 } else { 3444 /* change from double-byte mode to single-byte */ 3445 value|=(uint32_t)UCNV_SI<<8; 3446 length=2; 3447 prevLength=1; 3448 } 3449 } else { 3450 if(prevLength==2) { 3451 length=2; 3452 } else { 3453 /* change from single-byte mode to double-byte */ 3454 value|=(uint32_t)UCNV_SO<<16; 3455 length=3; 3456 prevLength=2; 3457 } 3458 } 3459 break; 3460 case MBCS_OUTPUT_DBCS_ONLY: 3461 /* table with single-byte results, but only DBCS mappings used */ 3462 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 3463 if(value<=0xff) { 3464 /* no mapping or SBCS result, not taken for DBCS-only */ 3465 goto unassigned; 3466 } else { 3467 length=2; 3468 } 3469 break; 3470 case MBCS_OUTPUT_3: 3471 p=bytes+(value+(c&0x3f))*3; 3472 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 3473 if(value<=0xff) { 3474 if(value==0) { 3475 goto unassigned; 3476 } else { 3477 length=1; 3478 } 3479 } else if(value<=0xffff) { 3480 length=2; 3481 } else { 3482 length=3; 3483 } 3484 break; 3485 case MBCS_OUTPUT_4: 3486 value=((const uint32_t *)bytes)[value +(c&0x3f)]; 3487 if(value<=0xff) { 3488 if(value==0) { 3489 goto unassigned; 3490 } else { 3491 length=1; 3492 } 3493 } else if(value<=0xffff) { 3494 length=2; 3495 } else if(value<=0xffffff) { 3496 length=3; 3497 } else { 3498 length=4; 3499 } 3500 break; 3501 case MBCS_OUTPUT_3_EUC: 3502 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 3503 /* EUC 16-bit fixed-length representation */ 3504 if(value<=0xff) { 3505 if(value==0) { 3506 goto unassigned; 3507 } else { 3508 length=1; 3509 } 3510 } else if((value&0x8000)==0) { 3511 value|=0x8e8000; 3512 length=3; 
3513 } else if((value&0x80)==0) { 3514 value|=0x8f0080; 3515 length=3; 3516 } else { 3517 length=2; 3518 } 3519 break; 3520 case MBCS_OUTPUT_4_EUC: 3521 p=bytes+(value+(c&0x3f))*3; 3522 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 3523 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 3524 if(value<=0xff) { 3525 if(value==0) { 3526 goto unassigned; 3527 } else { 3528 length=1; 3529 } 3530 } else if(value<=0xffff) { 3531 length=2; 3532 } else if((value&0x800000)==0) { 3533 value|=0x8e800000; 3534 length=4; 3535 } else if((value&0x8000)==0) { 3536 value|=0x8f008000; 3537 length=4; 3538 } else { 3539 length=3; 3540 } 3541 break; 3542 default: 3543 /* must not occur */ 3544 /* 3545 * To avoid compiler warnings that value & length may be 3546 * used without having been initialized, we set them here. 3547 * In reality, this is unreachable code. 3548 * Not having a default branch also causes warnings with 3549 * some compilers. 3550 */ 3551 value=0; 3552 length=0; 3553 break; 3554 } 3555 /* output the value */ 3556 } else { 3557 /* 3558 * This also tests if the codepage maps single surrogates. 3559 * If it does, then surrogates are not paired but mapped separately. 3560 * Note that in this case unmatched surrogates are not detected. 
3561 */ 3562 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3563 if(UTF_IS_SURROGATE_FIRST(c)) { 3564getTrail: 3565 if(source<sourceLimit) { 3566 /* test the following code unit */ 3567 UChar trail=*source; 3568 if(UTF_IS_SECOND_SURROGATE(trail)) { 3569 ++source; 3570 ++nextSourceIndex; 3571 c=UTF16_GET_PAIR_VALUE(c, trail); 3572 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3573 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3574 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 3575 /* callback(unassigned) */ 3576 goto unassigned; 3577 } 3578 /* convert this supplementary code point */ 3579 /* exit this condition tree */ 3580 } else { 3581 /* this is an unmatched lead code unit (1st surrogate) */ 3582 /* callback(illegal) */ 3583 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3584 break; 3585 } 3586 } else { 3587 /* no more input */ 3588 break; 3589 } 3590 } else { 3591 /* this is an unmatched trail code unit (2nd surrogate) */ 3592 /* callback(illegal) */ 3593 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3594 break; 3595 } 3596 } 3597 3598 /* convert the Unicode code point in c into codepage bytes */ 3599 3600 /* 3601 * The basic lookup is a triple-stage compact array (trie) lookup. 3602 * For details see the beginning of this file. 3603 * 3604 * Single-byte codepages are handled with a different data structure 3605 * by _MBCSSingle... functions. 3606 * 3607 * The result consists of a 32-bit value from stage 2 and 3608 * a pointer to as many bytes as are stored per character. 3609 * The pointer points to the character's bytes in stage 3. 3610 * Bits 15..0 of the stage 2 entry contain the stage 3 index 3611 * for that pointer, while bits 31..16 are flags for which of 3612 * the 16 characters in the block are roundtrip-assigned. 3613 * 3614 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t 3615 * respectively as uint32_t, in the platform encoding. 
3616 * For 3-byte codepages, the bytes are always stored in big-endian order. 3617 * 3618 * For EUC encodings that use only either 0x8e or 0x8f as the first 3619 * byte of their longest byte sequences, the first two bytes in 3620 * this third stage indicate with their 7th bits whether these bytes 3621 * are to be written directly or actually need to be preceeded by 3622 * one of the two Single-Shift codes. With this, the third stage 3623 * stores one byte fewer per character than the actual maximum length of 3624 * EUC byte sequences. 3625 * 3626 * Other than that, leading zero bytes are removed and the other 3627 * bytes output. A single zero byte may be output if the "assigned" 3628 * bit in stage 2 was on. 3629 * The data structure does not support zero byte output as a fallback, 3630 * and also does not allow output of leading zeros. 3631 */ 3632 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3633 3634 /* get the bytes and the length for the output */ 3635 switch(outputType) { 3636 case MBCS_OUTPUT_2: 3637 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3638 if(value<=0xff) { 3639 length=1; 3640 } else { 3641 length=2; 3642 } 3643 break; 3644 case MBCS_OUTPUT_2_SISO: 3645 /* 1/2-byte stateful with Shift-In/Shift-Out */ 3646 /* 3647 * Save the old state in the converter object 3648 * right here, then change the local prevLength state variable if necessary. 3649 * Then, if this character turns out to be unassigned or a fallback that 3650 * is not taken, the callback code must not save the new state in the converter 3651 * because the new state is for a character that is not output. 3652 * However, the callback must still restore the state from the converter 3653 * in case the callback function changed it for its output. 
3654 */ 3655 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 3656 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3657 if(value<=0xff) { 3658 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) { 3659 /* no mapping, leave value==0 */ 3660 length=0; 3661 } else if(prevLength<=1) { 3662 length=1; 3663 } else { 3664 /* change from double-byte mode to single-byte */ 3665 value|=(uint32_t)UCNV_SI<<8; 3666 length=2; 3667 prevLength=1; 3668 } 3669 } else { 3670 if(prevLength==2) { 3671 length=2; 3672 } else { 3673 /* change from single-byte mode to double-byte */ 3674 value|=(uint32_t)UCNV_SO<<16; 3675 length=3; 3676 prevLength=2; 3677 } 3678 } 3679 break; 3680 case MBCS_OUTPUT_DBCS_ONLY: 3681 /* table with single-byte results, but only DBCS mappings used */ 3682 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3683 if(value<=0xff) { 3684 /* no mapping or SBCS result, not taken for DBCS-only */ 3685 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 3686 length=0; 3687 } else { 3688 length=2; 3689 } 3690 break; 3691 case MBCS_OUTPUT_3: 3692 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 3693 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 3694 if(value<=0xff) { 3695 length=1; 3696 } else if(value<=0xffff) { 3697 length=2; 3698 } else { 3699 length=3; 3700 } 3701 break; 3702 case MBCS_OUTPUT_4: 3703 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); 3704 if(value<=0xff) { 3705 length=1; 3706 } else if(value<=0xffff) { 3707 length=2; 3708 } else if(value<=0xffffff) { 3709 length=3; 3710 } else { 3711 length=4; 3712 } 3713 break; 3714 case MBCS_OUTPUT_3_EUC: 3715 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3716 /* EUC 16-bit fixed-length representation */ 3717 if(value<=0xff) { 3718 length=1; 3719 } else if((value&0x8000)==0) { 3720 value|=0x8e8000; 3721 length=3; 3722 } else if((value&0x80)==0) { 3723 value|=0x8f0080; 3724 length=3; 3725 } else { 3726 length=2; 3727 } 3728 break; 3729 case 
MBCS_OUTPUT_4_EUC: 3730 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 3731 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 3732 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 3733 if(value<=0xff) { 3734 length=1; 3735 } else if(value<=0xffff) { 3736 length=2; 3737 } else if((value&0x800000)==0) { 3738 value|=0x8e800000; 3739 length=4; 3740 } else if((value&0x8000)==0) { 3741 value|=0x8f008000; 3742 length=4; 3743 } else { 3744 length=3; 3745 } 3746 break; 3747 default: 3748 /* must not occur */ 3749 /* 3750 * To avoid compiler warnings that value & length may be 3751 * used without having been initialized, we set them here. 3752 * In reality, this is unreachable code. 3753 * Not having a default branch also causes warnings with 3754 * some compilers. 3755 */ 3756 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 3757 length=0; 3758 break; 3759 } 3760 3761 /* is this code point assigned, or do we use fallbacks? */ 3762 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 || 3763 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 3764 ) { 3765 /* 3766 * We allow a 0 byte output if the "assigned" bit is set for this entry. 3767 * There is no way with this data structure for fallback output 3768 * to be a zero byte. 
3769 */ 3770 3771unassigned: 3772 /* try an extension mapping */ 3773 pArgs->source=source; 3774 c=_extFromU(cnv, cnv->sharedData, 3775 c, &source, sourceLimit, 3776 &target, target+targetCapacity, 3777 &offsets, sourceIndex, 3778 pArgs->flush, 3779 pErrorCode); 3780 nextSourceIndex+=(int32_t)(source-pArgs->source); 3781 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ 3782 3783 if(U_FAILURE(*pErrorCode)) { 3784 /* not mappable or buffer overflow */ 3785 break; 3786 } else { 3787 /* a mapping was written to the target, continue */ 3788 3789 /* recalculate the targetCapacity after an extension mapping */ 3790 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3791 3792 /* normal end of conversion: prepare for a new character */ 3793 if(offsets!=NULL) { 3794 prevSourceIndex=sourceIndex; 3795 sourceIndex=nextSourceIndex; 3796 } 3797 continue; 3798 } 3799 } 3800 } 3801 3802 /* write the output character bytes from value and length */ 3803 /* from the first if in the loop we know that targetCapacity>0 */ 3804 if(length<=targetCapacity) { 3805 if(offsets==NULL) { 3806 switch(length) { 3807 /* each branch falls through to the next one */ 3808 case 4: 3809 *target++=(uint8_t)(value>>24); 3810 case 3: 3811 *target++=(uint8_t)(value>>16); 3812 case 2: 3813 *target++=(uint8_t)(value>>8); 3814 case 1: 3815 *target++=(uint8_t)value; 3816 default: 3817 /* will never occur */ 3818 break; 3819 } 3820 } else { 3821 switch(length) { 3822 /* each branch falls through to the next one */ 3823 case 4: 3824 *target++=(uint8_t)(value>>24); 3825 *offsets++=sourceIndex; 3826 case 3: 3827 *target++=(uint8_t)(value>>16); 3828 *offsets++=sourceIndex; 3829 case 2: 3830 *target++=(uint8_t)(value>>8); 3831 *offsets++=sourceIndex; 3832 case 1: 3833 *target++=(uint8_t)value; 3834 *offsets++=sourceIndex; 3835 default: 3836 /* will never occur */ 3837 break; 3838 } 3839 } 3840 targetCapacity-=length; 3841 } else { 3842 uint8_t *charErrorBuffer; 3843 3844 /* 3845 * We actually do 
this backwards here: 3846 * In order to save an intermediate variable, we output 3847 * first to the overflow buffer what does not fit into the 3848 * regular target. 3849 */ 3850 /* we know that 1<=targetCapacity<length<=4 */ 3851 length-=targetCapacity; 3852 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 3853 switch(length) { 3854 /* each branch falls through to the next one */ 3855 case 3: 3856 *charErrorBuffer++=(uint8_t)(value>>16); 3857 case 2: 3858 *charErrorBuffer++=(uint8_t)(value>>8); 3859 case 1: 3860 *charErrorBuffer=(uint8_t)value; 3861 default: 3862 /* will never occur */ 3863 break; 3864 } 3865 cnv->charErrorBufferLength=(int8_t)length; 3866 3867 /* now output what fits into the regular target */ 3868 value>>=8*length; /* length was reduced by targetCapacity */ 3869 switch(targetCapacity) { 3870 /* each branch falls through to the next one */ 3871 case 3: 3872 *target++=(uint8_t)(value>>16); 3873 if(offsets!=NULL) { 3874 *offsets++=sourceIndex; 3875 } 3876 case 2: 3877 *target++=(uint8_t)(value>>8); 3878 if(offsets!=NULL) { 3879 *offsets++=sourceIndex; 3880 } 3881 case 1: 3882 *target++=(uint8_t)value; 3883 if(offsets!=NULL) { 3884 *offsets++=sourceIndex; 3885 } 3886 default: 3887 /* will never occur */ 3888 break; 3889 } 3890 3891 /* target overflow */ 3892 targetCapacity=0; 3893 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3894 c=0; 3895 break; 3896 } 3897 3898 /* normal end of conversion: prepare for a new character */ 3899 c=0; 3900 if(offsets!=NULL) { 3901 prevSourceIndex=sourceIndex; 3902 sourceIndex=nextSourceIndex; 3903 } 3904 continue; 3905 } else { 3906 /* target is full */ 3907 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3908 break; 3909 } 3910 } 3911 3912 /* 3913 * the end of the input stream and detection of truncated input 3914 * are handled by the framework, but for EBCDIC_STATEFUL conversion 3915 * we need to emit an SI at the very end 3916 * 3917 * conditions: 3918 * successful 3919 * EBCDIC_STATEFUL in DBCS mode 3920 * end of input and no 
truncated input 3921 */ 3922 if( U_SUCCESS(*pErrorCode) && 3923 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && 3924 pArgs->flush && source>=sourceLimit && c==0 3925 ) { 3926 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ 3927 if(targetCapacity>0) { 3928 *target++=(uint8_t)UCNV_SI; 3929 if(offsets!=NULL) { 3930 /* set the last source character's index (sourceIndex points at sourceLimit now) */ 3931 *offsets++=prevSourceIndex; 3932 } 3933 } else { 3934 /* target is full */ 3935 cnv->charErrorBuffer[0]=(char)UCNV_SI; 3936 cnv->charErrorBufferLength=1; 3937 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3938 } 3939 prevLength=1; /* we switched into SBCS */ 3940 } 3941 3942 /* set the converter state back into UConverter */ 3943 cnv->fromUChar32=c; 3944 cnv->fromUnicodeStatus=prevLength; 3945 3946 /* write back the updated pointers */ 3947 pArgs->source=source; 3948 pArgs->target=(char *)target; 3949 pArgs->offsets=offsets; 3950} 3951 3952/* 3953 * This is another simple conversion function for internal use by other 3954 * conversion implementations. 3955 * It does not use the converter state nor call callbacks. 3956 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3957 * It handles conversion extensions but not GB 18030. 3958 * 3959 * It converts one single Unicode code point into codepage bytes, encoded 3960 * as one 32-bit value. The function returns the number of bytes in *pValue: 3961 * 1..4 the number of bytes in *pValue 3962 * 0 unassigned (*pValue undefined) 3963 * -1 illegal (currently not used, *pValue undefined) 3964 * 3965 * *pValue will contain the resulting bytes with the last byte in bits 7..0, 3966 * the second to last byte in bits 15..8, etc. 3967 * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 
3968 */ 3969U_CFUNC int32_t 3970ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 3971 UChar32 c, uint32_t *pValue, 3972 UBool useFallback) { 3973 const int32_t *cx; 3974 const uint16_t *table; 3975#if 0 3976/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 3977 const uint8_t *p; 3978#endif 3979 uint32_t stage2Entry; 3980 uint32_t value; 3981 int32_t length; 3982 3983 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3984 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3985 table=sharedData->mbcs.fromUnicodeTable; 3986 3987 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 3988 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 3989 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 3990 /* is this code point assigned, or do we use fallbacks? */ 3991 if(useFallback ? value>=0x800 : value>=0xc00) { 3992 *pValue=value&0xff; 3993 return 1; 3994 } 3995 } else /* outputType!=MBCS_OUTPUT_1 */ { 3996 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3997 3998 /* get the bytes and the length for the output */ 3999 switch(sharedData->mbcs.outputType) { 4000 case MBCS_OUTPUT_2: 4001 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4002 if(value<=0xff) { 4003 length=1; 4004 } else { 4005 length=2; 4006 } 4007 break; 4008#if 0 4009/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4010 case MBCS_OUTPUT_DBCS_ONLY: 4011 /* table with single-byte results, but only DBCS mappings used */ 4012 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4013 if(value<=0xff) { 4014 /* no mapping or SBCS result, not taken for DBCS-only */ 4015 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4016 length=0; 4017 } else { 4018 length=2; 4019 } 4020 break; 4021 case MBCS_OUTPUT_3: 
4022 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4023 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4024 if(value<=0xff) { 4025 length=1; 4026 } else if(value<=0xffff) { 4027 length=2; 4028 } else { 4029 length=3; 4030 } 4031 break; 4032 case MBCS_OUTPUT_4: 4033 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4034 if(value<=0xff) { 4035 length=1; 4036 } else if(value<=0xffff) { 4037 length=2; 4038 } else if(value<=0xffffff) { 4039 length=3; 4040 } else { 4041 length=4; 4042 } 4043 break; 4044 case MBCS_OUTPUT_3_EUC: 4045 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4046 /* EUC 16-bit fixed-length representation */ 4047 if(value<=0xff) { 4048 length=1; 4049 } else if((value&0x8000)==0) { 4050 value|=0x8e8000; 4051 length=3; 4052 } else if((value&0x80)==0) { 4053 value|=0x8f0080; 4054 length=3; 4055 } else { 4056 length=2; 4057 } 4058 break; 4059 case MBCS_OUTPUT_4_EUC: 4060 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4061 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4062 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4063 if(value<=0xff) { 4064 length=1; 4065 } else if(value<=0xffff) { 4066 length=2; 4067 } else if((value&0x800000)==0) { 4068 value|=0x8e800000; 4069 length=4; 4070 } else if((value&0x8000)==0) { 4071 value|=0x8f008000; 4072 length=4; 4073 } else { 4074 length=3; 4075 } 4076 break; 4077#endif 4078 default: 4079 /* must not occur */ 4080 return -1; 4081 } 4082 4083 /* is this code point assigned, or do we use fallbacks? */ 4084 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4085 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 4086 ) { 4087 /* 4088 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4089 * There is no way with this data structure for fallback output 4090 * to be a zero byte. 
4091 */ 4092 /* assigned */ 4093 *pValue=value; 4094 return length; 4095 } 4096 } 4097 } 4098 4099 cx=sharedData->mbcs.extIndexes; 4100 if(cx!=NULL) { 4101 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 4102 return length>=0 ? length : -length; /* return abs(length); */ 4103 } 4104 4105 /* unassigned */ 4106 return 0; 4107} 4108 4109 4110#if 0 4111/* 4112 * This function has been moved to ucnv2022.c for inlining. 4113 * This implementation is here only for documentation purposes 4114 */ 4115 4116/** 4117 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. 4118 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4119 * It does not handle conversion extensions (_extFromU()). 4120 * 4121 * It returns the codepage byte for the code point, or -1 if it is unassigned. 4122 */ 4123U_CFUNC int32_t 4124ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, 4125 UChar32 c, 4126 UBool useFallback) { 4127 const uint16_t *table; 4128 int32_t value; 4129 4130 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4131 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4132 return -1; 4133 } 4134 4135 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4136 table=sharedData->mbcs.fromUnicodeTable; 4137 4138 /* get the byte for the output */ 4139 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4140 /* is this code point assigned, or do we use fallbacks? */ 4141 if(useFallback ? 
value>=0x800 : value>=0xc00) { 4142 return value&0xff; 4143 } else { 4144 return -1; 4145 } 4146} 4147#endif 4148 4149/* MBCS-from-UTF-8 conversion functions ------------------------------------- */ 4150 4151/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ 4152static const UChar32 4153utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; 4154 4155/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ 4156static const UChar32 4157utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; 4158 4159static void 4160ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 4161 UConverterToUnicodeArgs *pToUArgs, 4162 UErrorCode *pErrorCode) { 4163 UConverter *utf8, *cnv; 4164 const uint8_t *source, *sourceLimit; 4165 uint8_t *target; 4166 int32_t targetCapacity; 4167 4168 const uint16_t *table, *sbcsIndex; 4169 const uint16_t *results; 4170 4171 int8_t oldToULength, toULength, toULimit; 4172 4173 UChar32 c; 4174 uint8_t b, t1, t2; 4175 4176 uint32_t asciiRoundtrips; 4177 uint16_t value, minValue; 4178 UBool hasSupplementary; 4179 4180 /* set up the local pointers */ 4181 utf8=pToUArgs->converter; 4182 cnv=pFromUArgs->converter; 4183 source=(uint8_t *)pToUArgs->source; 4184 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 4185 target=(uint8_t *)pFromUArgs->target; 4186 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 4187 4188 table=cnv->sharedData->mbcs.fromUnicodeTable; 4189 sbcsIndex=cnv->sharedData->mbcs.sbcsIndex; 4190 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 4191 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 4192 } else { 4193 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 4194 } 4195 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4196 4197 if(cnv->useFallback) { 4198 /* use all roundtrip and fallback results */ 4199 minValue=0x800; 4200 } else { 4201 /* use only roundtrips and fallbacks from private-use characters */ 4202 minValue=0xc00; 4203 } 4204 
hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 4205 4206 /* get the converter state from the UTF-8 UConverter */ 4207 c=(UChar32)utf8->toUnicodeStatus; 4208 if(c!=0) { 4209 toULength=oldToULength=utf8->toULength; 4210 toULimit=(int8_t)utf8->mode; 4211 } else { 4212 toULength=oldToULength=toULimit=0; 4213 } 4214 4215 /* 4216 * Make sure that the last byte sequence before sourceLimit is complete 4217 * or runs into a lead byte. 4218 * Do not go back into the bytes that will be read for finishing a partial 4219 * sequence from the previous buffer. 4220 * In the conversion loop compare source with sourceLimit only once 4221 * per multi-byte character. 4222 */ 4223 { 4224 int32_t i, length; 4225 4226 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 4227 for(i=0; i<3 && i<length;) { 4228 b=*(sourceLimit-i-1); 4229 if(U8_IS_TRAIL(b)) { 4230 ++i; 4231 } else { 4232 if(i<utf8_countTrailBytes[b]) { 4233 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 4234 sourceLimit-=i+1; 4235 } 4236 break; 4237 } 4238 } 4239 } 4240 4241 if(c!=0 && targetCapacity>0) { 4242 utf8->toUnicodeStatus=0; 4243 utf8->toULength=0; 4244 goto moreBytes; 4245 /* 4246 * Note: We could avoid the goto by duplicating some of the moreBytes 4247 * code, but only up to the point of collecting a complete UTF-8 4248 * sequence; then recurse for the toUBytes[toULength] 4249 * and then continue with normal conversion. 4250 * 4251 * If so, move this code to just after initializing the minimum 4252 * set of local variables for reading the UTF-8 input 4253 * (utf8, source, target, limits but not cnv, table, minValue, etc.). 
4254 * 4255 * Potential advantages: 4256 * - avoid the goto 4257 * - oldToULength could become a local variable in just those code blocks 4258 * that deal with buffer boundaries 4259 * - possibly faster if the goto prevents some compiler optimizations 4260 * (this would need measuring to confirm) 4261 * Disadvantage: 4262 * - code duplication 4263 */ 4264 } 4265 4266 /* conversion loop */ 4267 while(source<sourceLimit) { 4268 if(targetCapacity>0) { 4269 b=*source++; 4270 if((int8_t)b>=0) { 4271 /* convert ASCII */ 4272 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 4273 *target++=(uint8_t)b; 4274 --targetCapacity; 4275 continue; 4276 } else { 4277 c=b; 4278 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c); 4279 } 4280 } else { 4281 if(b<0xe0) { 4282 if( /* handle U+0080..U+07FF inline */ 4283 b>=0xc2 && 4284 (t1=(uint8_t)(*source-0x80)) <= 0x3f 4285 ) { 4286 c=b&0x1f; 4287 ++source; 4288 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1); 4289 if(value>=minValue) { 4290 *target++=(uint8_t)value; 4291 --targetCapacity; 4292 continue; 4293 } else { 4294 c=(c<<6)|t1; 4295 } 4296 } else { 4297 c=-1; 4298 } 4299 } else if(b==0xe0) { 4300 if( /* handle U+0800..U+0FFF inline */ 4301 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 && 4302 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 4303 ) { 4304 c=t1; 4305 source+=2; 4306 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2); 4307 if(value>=minValue) { 4308 *target++=(uint8_t)value; 4309 --targetCapacity; 4310 continue; 4311 } else { 4312 c=(c<<6)|t2; 4313 } 4314 } else { 4315 c=-1; 4316 } 4317 } else { 4318 c=-1; 4319 } 4320 4321 if(c<0) { 4322 /* handle "complicated" and error cases, and continuing partial characters */ 4323 oldToULength=0; 4324 toULength=1; 4325 toULimit=utf8_countTrailBytes[b]+1; 4326 c=b; 4327moreBytes: 4328 while(toULength<toULimit) { 4329 if(source<sourceLimit) { 4330 b=*source; 4331 if(U8_IS_TRAIL(b)) { 4332 ++source; 4333 ++toULength; 4334 c=(c<<6)+b; 4335 } else { 4336 break; /* sequence 
too short, stop with toULength<toULimit */ 4337 } 4338 } else { 4339 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 4340 source-=(toULength-oldToULength); 4341 while(oldToULength<toULength) { 4342 utf8->toUBytes[oldToULength++]=*source++; 4343 } 4344 utf8->toUnicodeStatus=c; 4345 utf8->toULength=toULength; 4346 utf8->mode=toULimit; 4347 pToUArgs->source=(char *)source; 4348 pFromUArgs->target=(char *)target; 4349 return; 4350 } 4351 } 4352 4353 if( toULength==toULimit && /* consumed all trail bytes */ 4354 (toULength==3 || toULength==2) && /* BMP */ 4355 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 4356 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 4357 ) { 4358 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 4359 } else if( 4360 toULength==toULimit && toULength==4 && 4361 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 4362 ) { 4363 /* supplementary code point */ 4364 if(!hasSupplementary) { 4365 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4366 value=0; 4367 } else { 4368 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 4369 } 4370 } else { 4371 /* error handling: illegal UTF-8 byte sequence */ 4372 source-=(toULength-oldToULength); 4373 while(oldToULength<toULength) { 4374 utf8->toUBytes[oldToULength++]=*source++; 4375 } 4376 utf8->toULength=toULength; 4377 pToUArgs->source=(char *)source; 4378 pFromUArgs->target=(char *)target; 4379 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4380 return; 4381 } 4382 } 4383 } 4384 4385 if(value>=minValue) { 4386 /* output the mapping for c */ 4387 *target++=(uint8_t)value; 4388 --targetCapacity; 4389 } else { 4390 /* value<minValue means c is unassigned (unmappable) */ 4391 /* 4392 * Try an extension mapping. 4393 * Pass in no source because we don't have UTF-16 input. 4394 * If we have a partial match on c, we will return and revert 4395 * to UTF-8->UTF-16->charset conversion. 
4396 */ 4397 static const UChar nul=0; 4398 const UChar *noSource=&nul; 4399 c=_extFromU(cnv, cnv->sharedData, 4400 c, &noSource, noSource, 4401 &target, target+targetCapacity, 4402 NULL, -1, 4403 pFromUArgs->flush, 4404 pErrorCode); 4405 4406 if(U_FAILURE(*pErrorCode)) { 4407 /* not mappable or buffer overflow */ 4408 cnv->fromUChar32=c; 4409 break; 4410 } else if(cnv->preFromUFirstCP>=0) { 4411 /* 4412 * Partial match, return and revert to pivoting. 4413 * In normal from-UTF-16 conversion, we would just continue 4414 * but then exit the loop because the extension match would 4415 * have consumed the source. 4416 */ 4417 break; 4418 } else { 4419 /* a mapping was written to the target, continue */ 4420 4421 /* recalculate the targetCapacity after an extension mapping */ 4422 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 4423 } 4424 } 4425 } else { 4426 /* target is full */ 4427 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4428 break; 4429 } 4430 } 4431 4432 /* 4433 * The sourceLimit may have been adjusted before the conversion loop 4434 * to stop before a truncated sequence. 4435 * If so, then collect the truncated sequence now. 
4436 */ 4437 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { 4438 c=utf8->toUBytes[0]=b=*source++; 4439 toULength=1; 4440 toULimit=utf8_countTrailBytes[b]+1; 4441 while(source<sourceLimit) { 4442 utf8->toUBytes[toULength++]=b=*source++; 4443 c=(c<<6)+b; 4444 } 4445 utf8->toUnicodeStatus=c; 4446 utf8->toULength=toULength; 4447 utf8->mode=toULimit; 4448 } 4449 4450 /* write back the updated pointers */ 4451 pToUArgs->source=(char *)source; 4452 pFromUArgs->target=(char *)target; 4453} 4454 4455static void 4456ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 4457 UConverterToUnicodeArgs *pToUArgs, 4458 UErrorCode *pErrorCode) { 4459 UConverter *utf8, *cnv; 4460 const uint8_t *source, *sourceLimit; 4461 uint8_t *target; 4462 int32_t targetCapacity; 4463 4464 const uint16_t *table, *mbcsIndex; 4465 const uint16_t *results; 4466 4467 int8_t oldToULength, toULength, toULimit; 4468 4469 UChar32 c; 4470 uint8_t b, t1, t2; 4471 4472 uint32_t stage2Entry; 4473 uint32_t asciiRoundtrips; 4474 uint16_t value, minValue; 4475 UBool hasSupplementary; 4476 4477 /* set up the local pointers */ 4478 utf8=pToUArgs->converter; 4479 cnv=pFromUArgs->converter; 4480 source=(uint8_t *)pToUArgs->source; 4481 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 4482 target=(uint8_t *)pFromUArgs->target; 4483 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 4484 4485 table=cnv->sharedData->mbcs.fromUnicodeTable; 4486 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 4487 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 4488 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 4489 } else { 4490 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 4491 } 4492 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4493 4494 if(cnv->useFallback) { 4495 /* use all roundtrip and fallback results */ 4496 minValue=0x800; 4497 } else { 4498 /* use only roundtrips and fallbacks from private-use characters */ 4499 minValue=0xc00; 
4500 } 4501 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 4502 4503 /* get the converter state from the UTF-8 UConverter */ 4504 c=(UChar32)utf8->toUnicodeStatus; 4505 if(c!=0) { 4506 toULength=oldToULength=utf8->toULength; 4507 toULimit=(int8_t)utf8->mode; 4508 } else { 4509 toULength=oldToULength=toULimit=0; 4510 } 4511 4512 /* 4513 * Make sure that the last byte sequence before sourceLimit is complete 4514 * or runs into a lead byte. 4515 * Do not go back into the bytes that will be read for finishing a partial 4516 * sequence from the previous buffer. 4517 * In the conversion loop compare source with sourceLimit only once 4518 * per multi-byte character. 4519 */ 4520 { 4521 int32_t i, length; 4522 4523 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 4524 for(i=0; i<3 && i<length;) { 4525 b=*(sourceLimit-i-1); 4526 if(U8_IS_TRAIL(b)) { 4527 ++i; 4528 } else { 4529 if(i<utf8_countTrailBytes[b]) { 4530 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 4531 sourceLimit-=i+1; 4532 } 4533 break; 4534 } 4535 } 4536 } 4537 4538 if(c!=0 && targetCapacity>0) { 4539 utf8->toUnicodeStatus=0; 4540 utf8->toULength=0; 4541 goto moreBytes; 4542 /* See note in ucnv_SBCSFromUTF8() about this goto. 
*/ 4543 } 4544 4545 /* conversion loop */ 4546 while(source<sourceLimit) { 4547 if(targetCapacity>0) { 4548 b=*source++; 4549 if((int8_t)b>=0) { 4550 /* convert ASCII */ 4551 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 4552 *target++=b; 4553 --targetCapacity; 4554 continue; 4555 } else { 4556 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b); 4557 if(value==0) { 4558 c=b; 4559 goto unassigned; 4560 } 4561 } 4562 } else { 4563 if(b>0xe0) { 4564 if( /* handle U+1000..U+D7FF inline */ 4565 (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) || 4566 (b==0xed && (t1 <= 0x1f))) && 4567 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 4568 ) { 4569 c=((b&0xf)<<6)|t1; 4570 source+=2; 4571 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2); 4572 if(value==0) { 4573 c=(c<<6)|t2; 4574 goto unassigned; 4575 } 4576 } else { 4577 c=-1; 4578 } 4579 } else if(b<0xe0) { 4580 if( /* handle U+0080..U+07FF inline */ 4581 b>=0xc2 && 4582 (t1=(uint8_t)(*source-0x80)) <= 0x3f 4583 ) { 4584 c=b&0x1f; 4585 ++source; 4586 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1); 4587 if(value==0) { 4588 c=(c<<6)|t1; 4589 goto unassigned; 4590 } 4591 } else { 4592 c=-1; 4593 } 4594 } else { 4595 c=-1; 4596 } 4597 4598 if(c<0) { 4599 /* handle "complicated" and error cases, and continuing partial characters */ 4600 oldToULength=0; 4601 toULength=1; 4602 toULimit=utf8_countTrailBytes[b]+1; 4603 c=b; 4604moreBytes: 4605 while(toULength<toULimit) { 4606 if(source<sourceLimit) { 4607 b=*source; 4608 if(U8_IS_TRAIL(b)) { 4609 ++source; 4610 ++toULength; 4611 c=(c<<6)+b; 4612 } else { 4613 break; /* sequence too short, stop with toULength<toULimit */ 4614 } 4615 } else { 4616 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 4617 source-=(toULength-oldToULength); 4618 while(oldToULength<toULength) { 4619 utf8->toUBytes[oldToULength++]=*source++; 4620 } 4621 utf8->toUnicodeStatus=c; 4622 utf8->toULength=toULength; 4623 utf8->mode=toULimit; 4624 
pToUArgs->source=(char *)source; 4625 pFromUArgs->target=(char *)target; 4626 return; 4627 } 4628 } 4629 4630 if( toULength==toULimit && /* consumed all trail bytes */ 4631 (toULength==3 || toULength==2) && /* BMP */ 4632 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 4633 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 4634 ) { 4635 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4636 } else if( 4637 toULength==toULimit && toULength==4 && 4638 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 4639 ) { 4640 /* supplementary code point */ 4641 if(!hasSupplementary) { 4642 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4643 stage2Entry=0; 4644 } else { 4645 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4646 } 4647 } else { 4648 /* error handling: illegal UTF-8 byte sequence */ 4649 source-=(toULength-oldToULength); 4650 while(oldToULength<toULength) { 4651 utf8->toUBytes[oldToULength++]=*source++; 4652 } 4653 utf8->toULength=toULength; 4654 pToUArgs->source=(char *)source; 4655 pFromUArgs->target=(char *)target; 4656 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4657 return; 4658 } 4659 4660 /* get the bytes and the length for the output */ 4661 /* MBCS_OUTPUT_2 */ 4662 value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c); 4663 4664 /* is this code point assigned, or do we use fallbacks? 
*/ 4665 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4666 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 4667 ) { 4668 goto unassigned; 4669 } 4670 } 4671 } 4672 4673 /* write the output character bytes from value and length */ 4674 /* from the first if in the loop we know that targetCapacity>0 */ 4675 if(value<=0xff) { 4676 /* this is easy because we know that there is enough space */ 4677 *target++=(uint8_t)value; 4678 --targetCapacity; 4679 } else /* length==2 */ { 4680 *target++=(uint8_t)(value>>8); 4681 if(2<=targetCapacity) { 4682 *target++=(uint8_t)value; 4683 targetCapacity-=2; 4684 } else { 4685 cnv->charErrorBuffer[0]=(char)value; 4686 cnv->charErrorBufferLength=1; 4687 4688 /* target overflow */ 4689 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4690 break; 4691 } 4692 } 4693 continue; 4694 4695unassigned: 4696 { 4697 /* 4698 * Try an extension mapping. 4699 * Pass in no source because we don't have UTF-16 input. 4700 * If we have a partial match on c, we will return and revert 4701 * to UTF-8->UTF-16->charset conversion. 4702 */ 4703 static const UChar nul=0; 4704 const UChar *noSource=&nul; 4705 c=_extFromU(cnv, cnv->sharedData, 4706 c, &noSource, noSource, 4707 &target, target+targetCapacity, 4708 NULL, -1, 4709 pFromUArgs->flush, 4710 pErrorCode); 4711 4712 if(U_FAILURE(*pErrorCode)) { 4713 /* not mappable or buffer overflow */ 4714 cnv->fromUChar32=c; 4715 break; 4716 } else if(cnv->preFromUFirstCP>=0) { 4717 /* 4718 * Partial match, return and revert to pivoting. 4719 * In normal from-UTF-16 conversion, we would just continue 4720 * but then exit the loop because the extension match would 4721 * have consumed the source. 
4722 */ 4723 break; 4724 } else { 4725 /* a mapping was written to the target, continue */ 4726 4727 /* recalculate the targetCapacity after an extension mapping */ 4728 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 4729 continue; 4730 } 4731 } 4732 } else { 4733 /* target is full */ 4734 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4735 break; 4736 } 4737 } 4738 4739 /* 4740 * The sourceLimit may have been adjusted before the conversion loop 4741 * to stop before a truncated sequence. 4742 * If so, then collect the truncated sequence now. 4743 */ 4744 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { 4745 c=utf8->toUBytes[0]=b=*source++; 4746 toULength=1; 4747 toULimit=utf8_countTrailBytes[b]+1; 4748 while(source<sourceLimit) { 4749 utf8->toUBytes[toULength++]=b=*source++; 4750 c=(c<<6)+b; 4751 } 4752 utf8->toUnicodeStatus=c; 4753 utf8->toULength=toULength; 4754 utf8->mode=toULimit; 4755 } 4756 4757 /* write back the updated pointers */ 4758 pToUArgs->source=(char *)source; 4759 pFromUArgs->target=(char *)target; 4760} 4761 4762/* miscellaneous ------------------------------------------------------------ */ 4763 4764static void 4765ucnv_MBCSGetStarters(const UConverter* cnv, 4766 UBool starters[256], 4767 UErrorCode *pErrorCode) { 4768 const int32_t *state0; 4769 int i; 4770 4771 state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState]; 4772 for(i=0; i<256; ++i) { 4773 /* all bytes that cause a state transition from state 0 are lead bytes */ 4774 starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]); 4775 } 4776} 4777 4778/* 4779 * This is an internal function that allows other converter implementations 4780 * to check whether a byte is a lead byte. 
4781 */ 4782U_CFUNC UBool 4783ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { 4784 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]); 4785} 4786 4787static void 4788ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 4789 int32_t offsetIndex, 4790 UErrorCode *pErrorCode) { 4791 UConverter *cnv=pArgs->converter; 4792 char *p, *subchar; 4793 char buffer[4]; 4794 int32_t length; 4795 4796 /* first, select between subChar and subChar1 */ 4797 if( cnv->subChar1!=0 && 4798 (cnv->sharedData->mbcs.extIndexes!=NULL ? 4799 cnv->useSubChar1 : 4800 (cnv->invalidUCharBuffer[0]<=0xff)) 4801 ) { 4802 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ 4803 subchar=(char *)&cnv->subChar1; 4804 length=1; 4805 } else { 4806 /* select subChar in all other cases */ 4807 subchar=(char *)cnv->subChars; 4808 length=cnv->subCharLen; 4809 } 4810 4811 /* reset the selector for the next code point */ 4812 cnv->useSubChar1=FALSE; 4813 4814 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) { 4815 p=buffer; 4816 4817 /* fromUnicodeStatus contains prevLength */ 4818 switch(length) { 4819 case 1: 4820 if(cnv->fromUnicodeStatus==2) { 4821 /* DBCS mode and SBCS sub char: change to SBCS */ 4822 cnv->fromUnicodeStatus=1; 4823 *p++=UCNV_SI; 4824 } 4825 *p++=subchar[0]; 4826 break; 4827 case 2: 4828 if(cnv->fromUnicodeStatus<=1) { 4829 /* SBCS mode and DBCS sub char: change to DBCS */ 4830 cnv->fromUnicodeStatus=2; 4831 *p++=UCNV_SO; 4832 } 4833 *p++=subchar[0]; 4834 *p++=subchar[1]; 4835 break; 4836 default: 4837 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 4838 return; 4839 } 4840 subchar=buffer; 4841 length=(int32_t)(p-buffer); 4842 } 4843 4844 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode); 4845} 4846 4847U_CFUNC UConverterType 4848ucnv_MBCSGetType(const UConverter* converter) { 4849 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we 
cheat a little */ 4850 if(converter->sharedData->mbcs.countStates==1) { 4851 return (UConverterType)UCNV_SBCS; 4852 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) { 4853 return (UConverterType)UCNV_EBCDIC_STATEFUL; 4854 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) { 4855 return (UConverterType)UCNV_DBCS; 4856 } 4857 return (UConverterType)UCNV_MBCS; 4858} 4859 4860static const UConverterImpl _SBCSUTF8Impl={ 4861 UCNV_MBCS, 4862 4863 ucnv_MBCSLoad, 4864 ucnv_MBCSUnload, 4865 4866 ucnv_MBCSOpen, 4867 NULL, 4868 NULL, 4869 4870 ucnv_MBCSToUnicodeWithOffsets, 4871 ucnv_MBCSToUnicodeWithOffsets, 4872 ucnv_MBCSFromUnicodeWithOffsets, 4873 ucnv_MBCSFromUnicodeWithOffsets, 4874 ucnv_MBCSGetNextUChar, 4875 4876 ucnv_MBCSGetStarters, 4877 ucnv_MBCSGetName, 4878 ucnv_MBCSWriteSub, 4879 NULL, 4880 ucnv_MBCSGetUnicodeSet, 4881 4882 NULL, 4883 ucnv_SBCSFromUTF8 4884}; 4885 4886static const UConverterImpl _DBCSUTF8Impl={ 4887 UCNV_MBCS, 4888 4889 ucnv_MBCSLoad, 4890 ucnv_MBCSUnload, 4891 4892 ucnv_MBCSOpen, 4893 NULL, 4894 NULL, 4895 4896 ucnv_MBCSToUnicodeWithOffsets, 4897 ucnv_MBCSToUnicodeWithOffsets, 4898 ucnv_MBCSFromUnicodeWithOffsets, 4899 ucnv_MBCSFromUnicodeWithOffsets, 4900 ucnv_MBCSGetNextUChar, 4901 4902 ucnv_MBCSGetStarters, 4903 ucnv_MBCSGetName, 4904 ucnv_MBCSWriteSub, 4905 NULL, 4906 ucnv_MBCSGetUnicodeSet, 4907 4908 NULL, 4909 ucnv_DBCSFromUTF8 4910}; 4911 4912static const UConverterImpl _MBCSImpl={ 4913 UCNV_MBCS, 4914 4915 ucnv_MBCSLoad, 4916 ucnv_MBCSUnload, 4917 4918 ucnv_MBCSOpen, 4919 NULL, 4920 NULL, 4921 4922 ucnv_MBCSToUnicodeWithOffsets, 4923 ucnv_MBCSToUnicodeWithOffsets, 4924 ucnv_MBCSFromUnicodeWithOffsets, 4925 ucnv_MBCSFromUnicodeWithOffsets, 4926 ucnv_MBCSGetNextUChar, 4927 4928 ucnv_MBCSGetStarters, 4929 ucnv_MBCSGetName, 4930 ucnv_MBCSWriteSub, 4931 NULL, 4932 ucnv_MBCSGetUnicodeSet 4933}; 4934 4935 4936/* Static data is in 
tools/makeconv/ucnvstat.c for data-based 4937 * converters. Be sure to update it as well. 4938 */ 4939 4940const UConverterSharedData _MBCSData={ 4941 sizeof(UConverterSharedData), 1, 4942 NULL, NULL, NULL, FALSE, &_MBCSImpl, 4943 0 4944}; 4945 4946#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 4947