1/* 2****************************************************************************** 3* 4* Copyright (C) 2000-2009, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7****************************************************************************** 8* file name: ucnvmbcs.c 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2000jul03 14* created by: Markus W. Scherer 15* 16* The current code in this file replaces the previous implementation 17* of conversion code from multi-byte codepages to Unicode and back. 18* This implementation supports the following: 19* - legacy variable-length codepages with up to 4 bytes per character 20* - all Unicode code points (up to 0x10ffff) 21* - efficient distinction of unassigned vs. illegal byte sequences 22* - it is possible in fromUnicode() to directly deal with simple 23* stateful encodings (used for EBCDIC_STATEFUL) 24* - it is possible to convert Unicode code points 25* to a single zero byte (but not as a fallback except for SBCS) 26* 27* Remaining limitations in fromUnicode: 28* - byte sequences must not have leading zero bytes 29* - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 30* - limitation to up to 4 bytes per character 31* 32* ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 33* limitations and adds m:n character mappings and other features. 34* See ucnv_ext.h for details. 
35* 36* Change history: 37* 38* 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 39* MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 40* macros to ucnvmbcs.h file 41*/ 42 43#include "unicode/utypes.h" 44 45#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 46 47#include "unicode/ucnv.h" 48#include "unicode/ucnv_cb.h" 49#include "unicode/udata.h" 50#include "unicode/uset.h" 51#include "ucnv_bld.h" 52#include "ucnvmbcs.h" 53#include "ucnv_ext.h" 54#include "ucnv_cnv.h" 55#include "umutex.h" 56#include "cmemory.h" 57#include "cstring.h" 58 59/* control optimizations according to the platform */ 60#define MBCS_UNROLL_SINGLE_TO_BMP 1 61#define MBCS_UNROLL_SINGLE_FROM_BMP 0 62 63/* 64 * _MBCSHeader versions 5.3 & 4.3 65 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 66 * 67 * This version is optional. Version 5 is used for incompatible data format changes. 68 * makeconv will continue to generate version 4 files if possible. 69 * 70 * Changes from version 4: 71 * 72 * The main difference is an additional _MBCSHeader field with 73 * - the length (number of uint32_t) of the _MBCSHeader 74 * - flags for further incompatible data format changes 75 * - flags for further, backward compatible data format changes 76 * 77 * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from 78 * the file and needs to be reconstituted at load time. 79 * This requires a utf8Friendly format with an additional mbcsIndex table for fast 80 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. 81 * (For details about these structures see below, and see ucnvmbcs.h.) 82 * 83 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order 84 * of the Unicode code points. (This requires that the .ucm file has the |0 etc. 85 * precision markers for all mappings.) 
 *
 * All fallbacks have been moved to the extension table, leaving only roundtrips in the
 * omitted data that can be reconstituted from the toUnicode data.
 *
 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
 * With only roundtrip mappings in the base fromUnicode data, this part is fully
 * redundant with the mbcsIndex and will be reconstituted from that (also using the
 * stage 1 table which contains the information about how stage 2 was compacted).
 *
 * The rest of the stage 2 table, the part for code points above maxFastUChar,
 * is stored in the file and will be appended to the reconstituted part.
 *
 * The entire fromUBytes array is omitted from the file and will be reconstituted.
 * This is done by enumerating all toUnicode roundtrip mappings, performing
 * each mapping (using the stage 1 and reconstituted stage 2 tables) and
 * writing instead of reading the byte values.
 *
 * _MBCSHeader version 4.3
 *
 * Change from version 4.2:
 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
 *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
 *   files which can be used instead of stages 1 & 2.
 *   Faster lookups for roundtrips from most commonly used characters,
 *   and lookups from UTF-8 byte sequences with a natural bit distribution.
 *   See ucnvmbcs.h for more details.
 *
 * Change from version 4.1:
 * - Added an optional extension table structure at the end of the .cnv file.
 *   It is present if the upper bits of the header flags field contain a non-zero
 *   byte offset to it.
 *   Files that contain only a conversion table and no base table
 *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
 *   These contain the base table name between the MBCS header and the extension
 *   data.
121 * 122 * Change from version 4.0: 123 * - Replace header.reserved with header.fromUBytesLength so that all 124 * fields in the data have length. 125 * 126 * Changes from version 3 (for performance improvements): 127 * - new bit distribution for state table entries 128 * - reordered action codes 129 * - new data structure for single-byte fromUnicode 130 * + stage 2 only contains indexes 131 * + stage 3 stores 16 bits per character with classification bits 15..8 132 * - no multiplier for stage 1 entries 133 * - stage 2 for non-single-byte codepages contains the index and the flags in 134 * one 32-bit value 135 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 136 * 137 * For more details about old versions of the MBCS data structure, see 138 * the corresponding versions of this file. 139 * 140 * Converting stateless codepage data ---------------------------------------*** 141 * (or codepage data with simple states) to Unicode. 142 * 143 * Data structure and algorithm for converting from complex legacy codepages 144 * to Unicode. (Designed before 2000-may-22.) 145 * 146 * The basic idea is that the structure of legacy codepages can be described 147 * with state tables. 148 * When reading a byte stream, each input byte causes a state transition. 149 * Some transitions result in the output of a code point, some result in 150 * "unassigned" or "illegal" output. 151 * This is used here for character conversion. 152 * 153 * The data structure begins with a state table consisting of a row 154 * per state, with 256 entries (columns) per row for each possible input 155 * byte value. 156 * Each entry is 32 bits wide, with two formats distinguished by 157 * the sign bit (bit 31): 158 * 159 * One format for transitional entries (bit 31 not set) for non-final bytes, and 160 * one format for final entries (bit 31 set). 161 * Both formats contain the number of the next state in the same bit 162 * positions. 163 * State 0 is the initial state. 
 *
 * Most of the time, the offset values of subsequent states are added
 * up to a scalar value. This value will eventually be the index of
 * the Unicode code point in a table that follows the state table.
 * The effect is that the code points for final state table rows
 * are contiguous. The code points of final state rows follow each other
 * in the order of the references to those final states by previous
 * states, etc.
 *
 * For some terminal states, the offset is itself the output Unicode
 * code point (16 bits for a BMP code point, or 20 bits for a supplementary
 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
 * For others, the code point in the Unicode table is stored with either
 * one or two code units: one for BMP code points, two for a pair of
 * surrogates.
 * All code points for a final state entry take up the same number of code
 * units, regardless of whether they all actually _use_ the same number
 * of code units. This is necessary for simple array access.
 *
 * An additional feature comes in with what in ICU is called "fallback"
 * mappings:
 *
 * In addition to round-trippable, precise, 1:1 mappings, there are often
 * mappings defined between similar, though not the same, characters.
 * Typically, such mappings occur only in fromUnicode mapping tables because
 * Unicode has a superset repertoire of most other codepages. However, it
 * is possible to provide such mappings in the toUnicode tables, too.
 * In this case, the fallback mappings are partly integrated into the
 * general state tables because the structure of the encoding includes their
 * byte sequences.
 * For final entries in an initial state, fallback mappings are stored in
 * the entry itself like with roundtrip mappings.
 * For other final entries, they are stored in the code units table if
 * the entry is for a pair of code units.
198 * For single-unit results in the code units table, there is no space to 199 * alternatively hold a fallback mapping; in this case, the code unit 200 * is stored as U+fffe (unassigned), and the fallback mapping needs to 201 * be looked up by the scalar offset value in a separate table. 202 * 203 * "Unassigned" state entries really mean "structurally unassigned", 204 * i.e., such a byte sequence will never have a mapping result. 205 * 206 * The interpretation of the bits in each entry is as follows: 207 * 208 * Bit 31 not set, not a terminal entry ("transitional"): 209 * 30..24 next state 210 * 23..0 offset delta, to be added up 211 * 212 * Bit 31 set, terminal ("final") entry: 213 * 30..24 next state (regardless of action code) 214 * 23..20 action code: 215 * action codes 0 and 1 result in precise-mapping Unicode code points 216 * 0 valid byte sequence 217 * 19..16 not used, 0 218 * 15..0 16-bit Unicode BMP code point 219 * never U+fffe or U+ffff 220 * 1 valid byte sequence 221 * 19..0 20-bit Unicode supplementary code point 222 * never U+fffe or U+ffff 223 * 224 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 225 * 2 valid byte sequence (fallback) 226 * 19..16 not used, 0 227 * 15..0 16-bit Unicode BMP code point as fallback result 228 * 3 valid byte sequence (fallback) 229 * 19..0 20-bit Unicode supplementary code point as fallback result 230 * 231 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 232 * depending on the code units they result in 233 * 4 valid byte sequence 234 * 19..9 not used, 0 235 * 8..0 final offset delta 236 * pointing to one 16-bit code unit which may be 237 * fffe unassigned -- look for a fallback for this offset 238 * ffff illegal 239 * 5 valid byte sequence 240 * 19..9 not used, 0 241 * 8..0 final offset delta 242 * pointing to two 16-bit code units 243 * (typically UTF-16 surrogates) 244 * the result depends on the first code unit as follows: 245 * 0000..d7ff 
roundtrip BMP code point (1st alone) 246 * d800..dbff roundtrip surrogate pair (1st, 2nd) 247 * dc00..dfff fallback surrogate pair (1st-400, 2nd) 248 * e000 roundtrip BMP code point (2nd alone) 249 * e001 fallback BMP code point (2nd alone) 250 * fffe unassigned 251 * ffff illegal 252 * (the final offset deltas are at most 255 * 2, 253 * times 2 because of storing code unit pairs) 254 * 255 * 6 unassigned byte sequence 256 * 19..16 not used, 0 257 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 258 * this does not contain a final offset delta because the main 259 * purpose of this action code is to save scalar offset values; 260 * therefore, fallback values cannot be assigned to byte 261 * sequences that result in this action code 262 * 7 illegal byte sequence 263 * 19..16 not used, 0 264 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 265 * 8 state change only 266 * 19..0 not used, 0 267 * useful for state changes in simple stateful encodings, 268 * at Shift-In/Shift-Out codes 269 * 270 * 271 * 9..15 reserved for future use 272 * current implementations will only perform a state change 273 * and ignore bits 19..0 274 * 275 * An encoding with contiguous ranges of unassigned byte sequences, like 276 * Shift-JIS and especially EUC-TW, can be stored efficiently by having 277 * at least two states for the trail bytes: 278 * One trail byte state that results in code points, and one that only 279 * has "unassigned" and "illegal" terminal states. 280 * 281 * Note: partly by accident, this data structure supports simple stateful 282 * encodings without any additional logic. 283 * Currently, only simple Shift-In/Shift-Out schemes are handled with 284 * appropriate state tables (especially EBCDIC_STATEFUL!). 
285 * 286 * MBCS version 2 added: 287 * unassigned and illegal action codes have U+fffe and U+ffff 288 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 289 * 290 * Converting from Unicode to codepage bytes --------------------------------*** 291 * 292 * The conversion data structure for fromUnicode is designed for the known 293 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 294 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 295 * a roundtrip mapping. 296 * 297 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 298 * like in the character properties table. 299 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 300 * with the resulting bytes is at offsetFromUBytes. 301 * 302 * Beginning with version 4, single-byte codepages have a significantly different 303 * trie compared to other codepages. 304 * In all cases, the entry in stage 1 is directly the index of the block of 305 * 64 entries in stage 2. 306 * 307 * Single-byte lookup: 308 * 309 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 310 * Stage 3 contains one 16-bit word per result: 311 * Bits 15..8 indicate the kind of result: 312 * f roundtrip result 313 * c fallback result from private-use code point 314 * 8 fallback result from other code points 315 * 0 unassigned 316 * Bits 7..0 contain the codepage byte. A zero byte is always possible. 317 * 318 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 319 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 320 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 321 * ASCII code points can be looked up with a linear array access into stage 3. 322 * See maxFastUChar and other details in ucnvmbcs.h. 
323 * 324 * Multi-byte lookup: 325 * 326 * Stage 2 contains a 32-bit word for each 16-block in stage 3: 327 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 328 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 329 * If this test is false, then a non-zero result will be interpreted as 330 * a fallback mapping. 331 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 332 * 333 * Stage 3 contains 2, 3, or 4 bytes per result. 334 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 335 * while 3 bytes are stored as bytes in big-endian order. 336 * Leading zero bytes are ignored, and the number of bytes is counted. 337 * A zero byte mapping result is possible as a roundtrip result. 338 * For some output types, the actual result is processed from this; 339 * see ucnv_MBCSFromUnicodeWithOffsets(). 340 * 341 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 342 * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 343 * 344 * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 345 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 346 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 347 * ASCII code points can be looked up with a linear array access into stage 3. 348 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 349 * 350 * In version 3, stage 2 blocks may overlap by multiples of the multiplier 351 * for compaction. 352 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 353 * may overlap by any number of entries. 
354 * 355 * MBCS version 2 added: 356 * the converter checks for known output types, which allows 357 * adding new ones without crashing an unaware converter 358 */ 359 360static const UConverterImpl _SBCSUTF8Impl; 361static const UConverterImpl _DBCSUTF8Impl; 362 363/* GB 18030 data ------------------------------------------------------------ */ 364 365/* helper macros for linear values for GB 18030 four-byte sequences */ 366#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 367 368#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 369 370#define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 371 372/* 373 * Some ranges of GB 18030 where both the Unicode code points and the 374 * GB four-byte sequences are contiguous and are handled algorithmically by 375 * the special callback functions below. 376 * The values are start & end of Unicode & GB codes. 377 * 378 * Note that single surrogates are not mapped by GB 18030 379 * as of the re-released mapping tables from 2000-nov-30. 
380 */ 381static const uint32_t 382gb18030Ranges[13][4]={ 383 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 384 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 385 {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)}, 386 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 387 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 388 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 389 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 390 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 391 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 392 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 393 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 394 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 395 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 396}; 397 398/* bit flag for UConverter.options indicating GB 18030 special handling */ 399#define _MBCS_OPTION_GB18030 0x8000 400 401/* Miscellaneous ------------------------------------------------------------ */ 402 403/** 404 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from 405 * consecutive sequences of bytes, starting from the one encoded in value, 406 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) 407 * Does not currently support m:n mappings or reverse fallbacks. 408 * This function will not be called for sequences of bytes with leading zeros. 
409 * 410 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() 411 * @param value contains 1..4 bytes of the first byte sequence, right-aligned 412 * @param codePoints resulting Unicode code points, or negative if a byte sequence does 413 * not map to anything 414 * @return TRUE to continue enumeration, FALSE to stop 415 */ 416typedef UBool U_CALLCONV 417UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); 418 419/* similar to ucnv_MBCSGetNextUChar() but recursive */ 420static UBool 421enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], 422 int32_t state, uint32_t offset, 423 uint32_t value, 424 UConverterEnumToUCallback *callback, const void *context, 425 UErrorCode *pErrorCode) { 426 UChar32 codePoints[32]; 427 const int32_t *row; 428 const uint16_t *unicodeCodeUnits; 429 UChar32 anyCodePoints; 430 int32_t b, limit; 431 432 row=mbcsTable->stateTable[state]; 433 unicodeCodeUnits=mbcsTable->unicodeCodeUnits; 434 435 value<<=8; 436 anyCodePoints=-1; /* becomes non-negative if there is a mapping */ 437 438 b=(stateProps[state]&0x38)<<2; 439 if(b==0 && stateProps[state]>=0x40) { 440 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ 441 codePoints[0]=U_SENTINEL; 442 b=1; 443 } 444 limit=((stateProps[state]&7)+1)<<5; 445 while(b<limit) { 446 int32_t entry=row[b]; 447 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 448 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry); 449 if(stateProps[nextState]>=0) { 450 /* recurse to a state with non-ignorable actions */ 451 if(!enumToU( 452 mbcsTable, stateProps, nextState, 453 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 454 value|(uint32_t)b, 455 callback, context, 456 pErrorCode)) { 457 return FALSE; 458 } 459 } 460 codePoints[b&0x1f]=U_SENTINEL; 461 } else { 462 UChar32 c; 463 int32_t action; 464 465 /* 466 * An if-else-if chain provides more reliable performance for 467 * the most common cases compared to a switch. 
468 */ 469 action=MBCS_ENTRY_FINAL_ACTION(entry); 470 if(action==MBCS_STATE_VALID_DIRECT_16) { 471 /* output BMP code point */ 472 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 473 } else if(action==MBCS_STATE_VALID_16) { 474 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 475 c=unicodeCodeUnits[finalOffset]; 476 if(c<0xfffe) { 477 /* output BMP code point */ 478 } else { 479 c=U_SENTINEL; 480 } 481 } else if(action==MBCS_STATE_VALID_16_PAIR) { 482 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 483 c=unicodeCodeUnits[finalOffset++]; 484 if(c<0xd800) { 485 /* output BMP code point below 0xd800 */ 486 } else if(c<=0xdbff) { 487 /* output roundtrip or fallback supplementary code point */ 488 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 489 } else if(c==0xe000) { 490 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 491 c=unicodeCodeUnits[finalOffset]; 492 } else { 493 c=U_SENTINEL; 494 } 495 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 496 /* output supplementary code point */ 497 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 498 } else { 499 c=U_SENTINEL; 500 } 501 502 codePoints[b&0x1f]=c; 503 anyCodePoints&=c; 504 } 505 if(((++b)&0x1f)==0) { 506 if(anyCodePoints>=0) { 507 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) { 508 return FALSE; 509 } 510 anyCodePoints=-1; 511 } 512 } 513 } 514 return TRUE; 515} 516 517/* 518 * Only called if stateProps[state]==-1. 519 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 520 * MBCS_STATE_CHANGE_ONLY. 
521 */ 522static int8_t 523getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { 524 const int32_t *row; 525 int32_t min, max, entry, nextState; 526 527 row=stateTable[state]; 528 stateProps[state]=0; 529 530 /* find first non-ignorable state */ 531 for(min=0;; ++min) { 532 entry=row[min]; 533 nextState=MBCS_ENTRY_STATE(entry); 534 if(stateProps[nextState]==-1) { 535 getStateProp(stateTable, stateProps, nextState); 536 } 537 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 538 if(stateProps[nextState]>=0) { 539 break; 540 } 541 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 542 break; 543 } 544 if(min==0xff) { 545 stateProps[state]=-0x40; /* (int8_t)0xc0 */ 546 return stateProps[state]; 547 } 548 } 549 stateProps[state]|=(int8_t)((min>>5)<<3); 550 551 /* find last non-ignorable state */ 552 for(max=0xff; min<max; --max) { 553 entry=row[max]; 554 nextState=MBCS_ENTRY_STATE(entry); 555 if(stateProps[nextState]==-1) { 556 getStateProp(stateTable, stateProps, nextState); 557 } 558 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 559 if(stateProps[nextState]>=0) { 560 break; 561 } 562 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 563 break; 564 } 565 } 566 stateProps[state]|=(int8_t)(max>>5); 567 568 /* recurse further and collect direct-state information */ 569 while(min<=max) { 570 entry=row[min]; 571 nextState=MBCS_ENTRY_STATE(entry); 572 if(stateProps[nextState]==-1) { 573 getStateProp(stateTable, stateProps, nextState); 574 } 575 if(MBCS_ENTRY_IS_FINAL(entry)) { 576 stateProps[nextState]|=0x40; 577 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { 578 stateProps[state]|=0x40; 579 } 580 } 581 ++min; 582 } 583 return stateProps[state]; 584} 585 586/* 587 * Internal function enumerating the toUnicode data of an MBCS converter. 
588 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 589 * table, but could also be used for a future ucnv_getUnicodeSet() option 590 * that includes reverse fallbacks (after updating this function's implementation). 591 * Currently only handles roundtrip mappings. 592 * Does not currently handle extensions. 593 */ 594static void 595ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, 596 UConverterEnumToUCallback *callback, const void *context, 597 UErrorCode *pErrorCode) { 598 /* 599 * Properties for each state, to speed up the enumeration. 600 * Ignorable actions are unassigned/illegal/state-change-only: 601 * They do not lead to mappings. 602 * 603 * Bits 7..6: 604 * 1 direct/initial state (stateful converters have multiple) 605 * 0 non-initial state with transitions or with non-ignorable result actions 606 * -1 final state with only ignorable actions 607 * 608 * Bits 5..3: 609 * The lowest byte value with non-ignorable actions is 610 * value<<5 (rounded down). 611 * 612 * Bits 2..0: 613 * The highest byte value with non-ignorable actions is 614 * (value<<5)&0x1f (rounded up). 
615 */ 616 int8_t stateProps[MBCS_MAX_STATE_COUNT]; 617 int32_t state; 618 619 uprv_memset(stateProps, -1, sizeof(stateProps)); 620 621 /* recurse from state 0 and set all stateProps */ 622 getStateProp(mbcsTable->stateTable, stateProps, 0); 623 624 for(state=0; state<mbcsTable->countStates; ++state) { 625 /*if(stateProps[state]==-1) { 626 printf("unused/unreachable <icu:state> %d\n", state); 627 }*/ 628 if(stateProps[state]>=0x40) { 629 /* start from each direct state */ 630 enumToU( 631 mbcsTable, stateProps, state, 0, 0, 632 callback, context, 633 pErrorCode); 634 } 635 } 636} 637 638U_CFUNC void 639ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, 640 const USetAdder *sa, 641 UConverterUnicodeSet which, 642 UConverterSetFilter filter, 643 UErrorCode *pErrorCode) { 644 const UConverterMBCSTable *mbcsTable; 645 const uint16_t *table; 646 647 uint32_t st3; 648 uint16_t st1, maxStage1, st2; 649 650 UChar32 c; 651 652 /* enumerate the from-Unicode trie table */ 653 mbcsTable=&sharedData->mbcs; 654 table=mbcsTable->fromUnicodeTable; 655 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { 656 maxStage1=0x440; 657 } else { 658 maxStage1=0x40; 659 } 660 661 c=0; /* keep track of the current code point while enumerating */ 662 663 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 664 const uint16_t *stage2, *stage3, *results; 665 uint16_t minValue; 666 667 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 668 669 /* 670 * Set a threshold variable for selecting which mappings to use. 671 * See ucnv_MBCSSingleFromBMPWithOffsets() and 672 * MBCS_SINGLE_RESULT_FROM_U() for details. 
673 */ 674 if(which==UCNV_ROUNDTRIP_SET) { 675 /* use only roundtrips */ 676 minValue=0xf00; 677 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 678 /* use all roundtrip and fallback results */ 679 minValue=0x800; 680 } 681 682 for(st1=0; st1<maxStage1; ++st1) { 683 st2=table[st1]; 684 if(st2>maxStage1) { 685 stage2=table+st2; 686 for(st2=0; st2<64; ++st2) { 687 if((st3=stage2[st2])!=0) { 688 /* read the stage 3 block */ 689 stage3=results+st3; 690 691 do { 692 if(*stage3++>=minValue) { 693 sa->add(sa->set, c); 694 } 695 } while((++c&0xf)!=0); 696 } else { 697 c+=16; /* empty stage 3 block */ 698 } 699 } 700 } else { 701 c+=1024; /* empty stage 2 block */ 702 } 703 } 704 } else { 705 const uint32_t *stage2; 706 const uint8_t *stage3, *bytes; 707 uint32_t st3Multiplier; 708 uint32_t value; 709 UBool useFallback; 710 711 bytes=mbcsTable->fromUnicodeBytes; 712 713 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); 714 715 switch(mbcsTable->outputType) { 716 case MBCS_OUTPUT_3: 717 case MBCS_OUTPUT_4_EUC: 718 st3Multiplier=3; 719 break; 720 case MBCS_OUTPUT_4: 721 st3Multiplier=4; 722 break; 723 default: 724 st3Multiplier=2; 725 break; 726 } 727 728 for(st1=0; st1<maxStage1; ++st1) { 729 st2=table[st1]; 730 if(st2>(maxStage1>>1)) { 731 stage2=(const uint32_t *)table+st2; 732 for(st2=0; st2<64; ++st2) { 733 if((st3=stage2[st2])!=0) { 734 /* read the stage 3 block */ 735 stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; 736 737 /* get the roundtrip flags for the stage 3 block */ 738 st3>>=16; 739 740 /* 741 * Add code points for which the roundtrip flag is set, 742 * or which map to non-zero bytes if we use fallbacks. 743 * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
744 */ 745 switch(filter) { 746 case UCNV_SET_FILTER_NONE: 747 do { 748 if(st3&1) { 749 sa->add(sa->set, c); 750 stage3+=st3Multiplier; 751 } else if(useFallback) { 752 uint8_t b=0; 753 switch(st3Multiplier) { 754 case 4: 755 b|=*stage3++; 756 case 3: 757 b|=*stage3++; 758 case 2: 759 b|=stage3[0]|stage3[1]; 760 stage3+=2; 761 default: 762 break; 763 } 764 if(b!=0) { 765 sa->add(sa->set, c); 766 } 767 } 768 st3>>=1; 769 } while((++c&0xf)!=0); 770 break; 771 case UCNV_SET_FILTER_DBCS_ONLY: 772 /* Ignore single-byte results (<0x100). */ 773 do { 774 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { 775 sa->add(sa->set, c); 776 } 777 st3>>=1; 778 stage3+=2; /* +=st3Multiplier */ 779 } while((++c&0xf)!=0); 780 break; 781 case UCNV_SET_FILTER_2022_CN: 782 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ 783 do { 784 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { 785 sa->add(sa->set, c); 786 } 787 st3>>=1; 788 stage3+=3; /* +=st3Multiplier */ 789 } while((++c&0xf)!=0); 790 break; 791 case UCNV_SET_FILTER_SJIS: 792 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ 793 do { 794 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { 795 sa->add(sa->set, c); 796 } 797 st3>>=1; 798 stage3+=2; /* +=st3Multiplier */ 799 } while((++c&0xf)!=0); 800 break; 801 case UCNV_SET_FILTER_GR94DBCS: 802 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */ 803 do { 804 if( ((st3&1)!=0 || useFallback) && 805 (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && 806 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 807 ) { 808 sa->add(sa->set, c); 809 } 810 st3>>=1; 811 stage3+=2; /* +=st3Multiplier */ 812 } while((++c&0xf)!=0); 813 break; 814 case UCNV_SET_FILTER_HZ: 815 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). 
*/ 816 do { 817 if( ((st3&1)!=0 || useFallback) && 818 (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && 819 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 820 ) { 821 sa->add(sa->set, c); 822 } 823 st3>>=1; 824 stage3+=2; /* +=st3Multiplier */ 825 } while((++c&0xf)!=0); 826 break; 827 default: 828 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 829 return; 830 } 831 } else { 832 c+=16; /* empty stage 3 block */ 833 } 834 } 835 } else { 836 c+=1024; /* empty stage 2 block */ 837 } 838 } 839 } 840 841 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); 842} 843 844U_CFUNC void 845ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, 846 const USetAdder *sa, 847 UConverterUnicodeSet which, 848 UErrorCode *pErrorCode) { 849 ucnv_MBCSGetFilteredUnicodeSetForUnicode( 850 sharedData, sa, which, 851 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 852 UCNV_SET_FILTER_DBCS_ONLY : 853 UCNV_SET_FILTER_NONE, 854 pErrorCode); 855} 856 857static void 858ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 859 const USetAdder *sa, 860 UConverterUnicodeSet which, 861 UErrorCode *pErrorCode) { 862 if(cnv->options&_MBCS_OPTION_GB18030) { 863 sa->addRange(sa->set, 0, 0xd7ff); 864 sa->addRange(sa->set, 0xe000, 0x10ffff); 865 } else { 866 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode); 867 } 868} 869 870/* conversion extensions for input not in the main table -------------------- */ 871 872/* 873 * Hardcoded extension handling for GB 18030. 874 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. 875 * 876 * In the future, conversion extensions may handle m:n mappings and delta tables, 877 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html 878 * 879 * If an input character cannot be mapped, then these functions set an error 880 * code. The framework will then call the callback function. 
881 */ 882 883/* 884 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 885 * else return 0 after output has been written to the target 886 */ 887static UChar32 888_extFromU(UConverter *cnv, const UConverterSharedData *sharedData, 889 UChar32 cp, 890 const UChar **source, const UChar *sourceLimit, 891 uint8_t **target, const uint8_t *targetLimit, 892 int32_t **offsets, int32_t sourceIndex, 893 UBool flush, 894 UErrorCode *pErrorCode) { 895 const int32_t *cx; 896 897 cnv->useSubChar1=FALSE; 898 899 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 900 ucnv_extInitialMatchFromU( 901 cnv, cx, 902 cp, source, sourceLimit, 903 (char **)target, (char *)targetLimit, 904 offsets, sourceIndex, 905 flush, 906 pErrorCode) 907 ) { 908 return 0; /* an extension mapping handled the input */ 909 } 910 911 /* GB 18030 */ 912 if((cnv->options&_MBCS_OPTION_GB18030)!=0) { 913 const uint32_t *range; 914 int32_t i; 915 916 range=gb18030Ranges[0]; 917 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 918 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) { 919 /* found the Unicode code point, output the four-byte sequence for it */ 920 uint32_t linear; 921 char bytes[4]; 922 923 /* get the linear value of the first GB 18030 code in this range */ 924 linear=range[2]-LINEAR_18030_BASE; 925 926 /* add the offset from the beginning of the range */ 927 linear+=((uint32_t)cp-range[0]); 928 929 /* turn this into a four-byte sequence */ 930 bytes[3]=(char)(0x30+linear%10); linear/=10; 931 bytes[2]=(char)(0x81+linear%126); linear/=126; 932 bytes[1]=(char)(0x30+linear%10); linear/=10; 933 bytes[0]=(char)(0x81+linear); 934 935 /* output this sequence */ 936 ucnv_fromUWriteBytes(cnv, 937 bytes, 4, (char **)target, (char *)targetLimit, 938 offsets, sourceIndex, pErrorCode); 939 return 0; 940 } 941 } 942 } 943 944 /* no mapping */ 945 *pErrorCode=U_INVALID_CHAR_FOUND; 946 return cp; 947} 948 949/* 950 * Input sequence: cnv->toUBytes[0..length[ 951 * @return 
if(U_FAILURE) return the length (toULength, byteIndex) for the input 952 * else return 0 after output has been written to the target 953 */ 954static int8_t 955_extToU(UConverter *cnv, const UConverterSharedData *sharedData, 956 int8_t length, 957 const uint8_t **source, const uint8_t *sourceLimit, 958 UChar **target, const UChar *targetLimit, 959 int32_t **offsets, int32_t sourceIndex, 960 UBool flush, 961 UErrorCode *pErrorCode) { 962 const int32_t *cx; 963 964 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 965 ucnv_extInitialMatchToU( 966 cnv, cx, 967 length, (const char **)source, (const char *)sourceLimit, 968 target, targetLimit, 969 offsets, sourceIndex, 970 flush, 971 pErrorCode) 972 ) { 973 return 0; /* an extension mapping handled the input */ 974 } 975 976 /* GB 18030 */ 977 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { 978 const uint32_t *range; 979 uint32_t linear; 980 int32_t i; 981 982 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]); 983 range=gb18030Ranges[0]; 984 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 985 if(range[2]<=linear && linear<=range[3]) { 986 /* found the sequence, output the Unicode code point for it */ 987 *pErrorCode=U_ZERO_ERROR; 988 989 /* add the linear difference between the input and start sequences to the start code point */ 990 linear=range[0]+(linear-range[2]); 991 992 /* output this code point */ 993 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode); 994 995 return 0; 996 } 997 } 998 } 999 1000 /* no mapping */ 1001 *pErrorCode=U_INVALID_CHAR_FOUND; 1002 return length; 1003} 1004 1005/* EBCDIC swap LF<->NL ------------------------------------------------------ */ 1006 1007/* 1008 * This code modifies a standard EBCDIC<->Unicode mapping table for 1009 * OS/390 (z/OS) Unix System Services (Open Edition). 
1010 * The difference is in the mapping of Line Feed and New Line control codes: 1011 * Standard EBCDIC maps 1012 * 1013 * <U000A> \x25 |0 1014 * <U0085> \x15 |0 1015 * 1016 * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 1017 * mapping 1018 * 1019 * <U000A> \x15 |0 1020 * <U0085> \x25 |0 1021 * 1022 * This code modifies a loaded standard EBCDIC<->Unicode mapping table 1023 * by copying it into allocated memory and swapping the LF and NL values. 1024 * It allows to support the same EBCDIC charset in both versions without 1025 * duplicating the entire installed table. 1026 */ 1027 1028/* standard EBCDIC codes */ 1029#define EBCDIC_LF 0x25 1030#define EBCDIC_NL 0x15 1031 1032/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 1033#define EBCDIC_RT_LF 0xf25 1034#define EBCDIC_RT_NL 0xf15 1035 1036/* Unicode code points */ 1037#define U_LF 0x0a 1038#define U_NL 0x85 1039 1040static UBool 1041_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { 1042 UConverterMBCSTable *mbcsTable; 1043 1044 const uint16_t *table, *results; 1045 const uint8_t *bytes; 1046 1047 int32_t (*newStateTable)[256]; 1048 uint16_t *newResults; 1049 uint8_t *p; 1050 char *name; 1051 1052 uint32_t stage2Entry; 1053 uint32_t size, sizeofFromUBytes; 1054 1055 mbcsTable=&sharedData->mbcs; 1056 1057 table=mbcsTable->fromUnicodeTable; 1058 bytes=mbcsTable->fromUnicodeBytes; 1059 results=(const uint16_t *)bytes; 1060 1061 /* 1062 * Check that this is an EBCDIC table with SBCS portion - 1063 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings. 1064 * 1065 * If not, ignore the option. Options are always ignored if they do not apply. 
1066 */ 1067 if(!( 1068 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) && 1069 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 1070 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL) 1071 )) { 1072 return FALSE; 1073 } 1074 1075 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1076 if(!( 1077 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 1078 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL) 1079 )) { 1080 return FALSE; 1081 } 1082 } else /* MBCS_OUTPUT_2_SISO */ { 1083 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1084 if(!( 1085 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 && 1086 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF) 1087 )) { 1088 return FALSE; 1089 } 1090 1091 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1092 if(!( 1093 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 && 1094 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL) 1095 )) { 1096 return FALSE; 1097 } 1098 } 1099 1100 if(mbcsTable->fromUBytesLength>0) { 1101 /* 1102 * We _know_ the number of bytes in the fromUnicodeBytes array 1103 * starting with header.version 4.1. 1104 */ 1105 sizeofFromUBytes=mbcsTable->fromUBytesLength; 1106 } else { 1107 /* 1108 * Otherwise: 1109 * There used to be code to enumerate the fromUnicode 1110 * trie and find the highest entry, but it was removed in ICU 3.2 1111 * because it was not tested and caused a low code coverage number. 1112 * See Jitterbug 3674. 1113 * This affects only some .cnv file formats with a header.version 1114 * below 4.1, and only when swaplfnl is requested. 1115 * 1116 * ucnvmbcs.c revision 1.99 is the last one with the 1117 * ucnv_MBCSSizeofFromUBytes() function. 1118 */ 1119 *pErrorCode=U_INVALID_FORMAT_ERROR; 1120 return FALSE; 1121 } 1122 1123 /* 1124 * The table has an appropriate format. 
1125 * Allocate and build 1126 * - a modified to-Unicode state table 1127 * - a modified from-Unicode output array 1128 * - a converter name string with the swap option appended 1129 */ 1130 size= 1131 mbcsTable->countStates*1024+ 1132 sizeofFromUBytes+ 1133 UCNV_MAX_CONVERTER_NAME_LENGTH+20; 1134 p=(uint8_t *)uprv_malloc(size); 1135 if(p==NULL) { 1136 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1137 return FALSE; 1138 } 1139 1140 /* copy and modify the to-Unicode state table */ 1141 newStateTable=(int32_t (*)[256])p; 1142 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024); 1143 1144 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 1145 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 1146 1147 /* copy and modify the from-Unicode result table */ 1148 newResults=(uint16_t *)newStateTable[mbcsTable->countStates]; 1149 uprv_memcpy(newResults, bytes, sizeofFromUBytes); 1150 1151 /* conveniently, the table access macros work on the left side of expressions */ 1152 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1153 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL; 1154 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF; 1155 } else /* MBCS_OUTPUT_2_SISO */ { 1156 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1157 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL; 1158 1159 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1160 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF; 1161 } 1162 1163 /* set the canonical converter name */ 1164 name=(char *)newResults+sizeofFromUBytes; 1165 uprv_strcpy(name, sharedData->staticData->name); 1166 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING); 1167 1168 /* set the pointers */ 1169 umtx_lock(NULL); 1170 if(mbcsTable->swapLFNLStateTable==NULL) { 1171 mbcsTable->swapLFNLStateTable=newStateTable; 1172 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults; 1173 
mbcsTable->swapLFNLName=name; 1174 1175 newStateTable=NULL; 1176 } 1177 umtx_unlock(NULL); 1178 1179 /* release the allocated memory if another thread beat us to it */ 1180 if(newStateTable!=NULL) { 1181 uprv_free(newStateTable); 1182 } 1183 return TRUE; 1184} 1185 1186/* reconstitute omitted fromUnicode data ------------------------------------ */ 1187 1188/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */ 1189static UBool U_CALLCONV 1190writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) { 1191 UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context; 1192 const uint16_t *table; 1193 uint32_t *stage2; 1194 uint8_t *bytes, *p; 1195 UChar32 c; 1196 int32_t i, st3; 1197 1198 table=mbcsTable->fromUnicodeTable; 1199 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes; 1200 1201 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ 1202 switch(mbcsTable->outputType) { 1203 case MBCS_OUTPUT_3_EUC: 1204 if(value<=0xffff) { 1205 /* short sequences are stored directly */ 1206 /* code set 0 or 1 */ 1207 } else if(value<=0x8effff) { 1208 /* code set 2 */ 1209 value&=0x7fff; 1210 } else /* first byte is 0x8f */ { 1211 /* code set 3 */ 1212 value&=0xff7f; 1213 } 1214 break; 1215 case MBCS_OUTPUT_4_EUC: 1216 if(value<=0xffffff) { 1217 /* short sequences are stored directly */ 1218 /* code set 0 or 1 */ 1219 } else if(value<=0x8effffff) { 1220 /* code set 2 */ 1221 value&=0x7fffff; 1222 } else /* first byte is 0x8f */ { 1223 /* code set 3 */ 1224 value&=0xff7fff; 1225 } 1226 break; 1227 default: 1228 break; 1229 } 1230 1231 for(i=0; i<=0x1f; ++value, ++i) { 1232 c=codePoints[i]; 1233 if(c<0) { 1234 continue; 1235 } 1236 1237 /* locate the stage 2 & 3 data */ 1238 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f); 1239 p=bytes; 1240 st3=(int32_t)(uint16_t)*stage2*16+(c&0xf); 1241 1242 /* write the codepage bytes into stage 3 */ 1243 switch(mbcsTable->outputType) { 1244 case MBCS_OUTPUT_3: 1245 
case MBCS_OUTPUT_4_EUC: 1246 p+=st3*3; 1247 p[0]=(uint8_t)(value>>16); 1248 p[1]=(uint8_t)(value>>8); 1249 p[2]=(uint8_t)value; 1250 break; 1251 case MBCS_OUTPUT_4: 1252 ((uint32_t *)p)[st3]=value; 1253 break; 1254 default: 1255 /* 2 bytes per character */ 1256 ((uint16_t *)p)[st3]=(uint16_t)value; 1257 break; 1258 } 1259 1260 /* set the roundtrip flag */ 1261 *stage2|=(1UL<<(16+(c&0xf))); 1262 } 1263 return TRUE; 1264 } 1265 1266static void 1267reconstituteData(UConverterMBCSTable *mbcsTable, 1268 uint32_t stage1Length, uint32_t stage2Length, 1269 uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */ 1270 UErrorCode *pErrorCode) { 1271 uint16_t *stage1; 1272 uint32_t *stage2; 1273 uint8_t *bytes; 1274 uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength; 1275 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength); 1276 if(mbcsTable->reconstitutedData==NULL) { 1277 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1278 return; 1279 } 1280 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength); 1281 1282 /* copy existing data and reroute the pointers */ 1283 stage1=(uint16_t *)mbcsTable->reconstitutedData; 1284 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2); 1285 1286 stage2=(uint32_t *)(stage1+stage1Length); 1287 uprv_memcpy(stage2+(fullStage2Length-stage2Length), 1288 mbcsTable->fromUnicodeTable+stage1Length, 1289 stage2Length*4); 1290 1291 mbcsTable->fromUnicodeTable=stage1; 1292 mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length); 1293 1294 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */ 1295 stage2=(uint32_t *)stage1; 1296 1297 /* reconstitute the initial part of stage 2 from the mbcsIndex */ 1298 { 1299 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6; 1300 int32_t stageUTF8Index=0; 1301 int32_t st1, st2, st3, i; 1302 1303 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) { 1304 st2=stage1[st1]; 1305 if(st2!=stage1Length/2) { 1306 /* 
each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */ 1307 for(i=0; i<16; ++i) { 1308 st3=mbcsTable->mbcsIndex[stageUTF8Index++]; 1309 if(st3!=0) { 1310 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ 1311 st3>>=4; 1312 /* 1313 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are 1314 * allocated together as a single 64-block for access from the mbcsIndex 1315 */ 1316 stage2[st2++]=st3++; 1317 stage2[st2++]=st3++; 1318 stage2[st2++]=st3++; 1319 stage2[st2++]=st3; 1320 } else { 1321 /* no stage 3 block, skip */ 1322 st2+=4; 1323 } 1324 } 1325 } else { 1326 /* no stage 2 block, skip */ 1327 stageUTF8Index+=16; 1328 } 1329 } 1330 } 1331 1332 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ 1333 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode); 1334} 1335 1336/* MBCS setup functions ----------------------------------------------------- */ 1337 1338static void 1339ucnv_MBCSLoad(UConverterSharedData *sharedData, 1340 UConverterLoadArgs *pArgs, 1341 const uint8_t *raw, 1342 UErrorCode *pErrorCode) { 1343 UDataInfo info; 1344 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1345 _MBCSHeader *header=(_MBCSHeader *)raw; 1346 uint32_t offset; 1347 uint32_t headerLength; 1348 UBool noFromU=FALSE; 1349 1350 if(header->version[0]==4) { 1351 headerLength=MBCS_HEADER_V4_LENGTH; 1352 } else if(header->version[0]==5 && header->version[1]>=3 && 1353 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) { 1354 headerLength=header->options&MBCS_OPT_LENGTH_MASK; 1355 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0); 1356 } else { 1357 *pErrorCode=U_INVALID_TABLE_FORMAT; 1358 return; 1359 } 1360 1361 mbcsTable->outputType=(uint8_t)header->flags; 1362 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) { 1363 *pErrorCode=U_INVALID_TABLE_FORMAT; 1364 return; 1365 } 1366 1367 /* extension data, header version 4.2 and higher */ 1368 
offset=header->flags>>8; 1369 if(offset!=0) { 1370 mbcsTable->extIndexes=(const int32_t *)(raw+offset); 1371 } 1372 1373 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) { 1374 UConverterLoadArgs args={ 0 }; 1375 UConverterSharedData *baseSharedData; 1376 const int32_t *extIndexes; 1377 const char *baseName; 1378 1379 /* extension-only file, load the base table and set values appropriately */ 1380 if((extIndexes=mbcsTable->extIndexes)==NULL) { 1381 /* extension-only file without extension */ 1382 *pErrorCode=U_INVALID_TABLE_FORMAT; 1383 return; 1384 } 1385 1386 if(pArgs->nestedLoads!=1) { 1387 /* an extension table must not be loaded as a base table */ 1388 *pErrorCode=U_INVALID_TABLE_FILE; 1389 return; 1390 } 1391 1392 /* load the base table */ 1393 baseName=(const char *)header+headerLength*4; 1394 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) { 1395 /* forbid loading this same extension-only file */ 1396 *pErrorCode=U_INVALID_TABLE_FORMAT; 1397 return; 1398 } 1399 1400 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */ 1401 args.size=sizeof(UConverterLoadArgs); 1402 args.nestedLoads=2; 1403 args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable; 1404 args.reserved=pArgs->reserved; 1405 args.options=pArgs->options; 1406 args.pkg=pArgs->pkg; 1407 args.name=baseName; 1408 baseSharedData=ucnv_load(&args, pErrorCode); 1409 if(U_FAILURE(*pErrorCode)) { 1410 return; 1411 } 1412 if( baseSharedData->staticData->conversionType!=UCNV_MBCS || 1413 baseSharedData->mbcs.baseSharedData!=NULL 1414 ) { 1415 ucnv_unload(baseSharedData); 1416 *pErrorCode=U_INVALID_TABLE_FORMAT; 1417 return; 1418 } 1419 if(pArgs->onlyTestIsLoadable) { 1420 /* 1421 * Exit as soon as we know that we can load the converter 1422 * and the format is valid and supported. 1423 * The worst that can happen in the following code is a memory 1424 * allocation error. 
1425 */ 1426 ucnv_unload(baseSharedData); 1427 return; 1428 } 1429 1430 /* copy the base table data */ 1431 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable)); 1432 1433 /* overwrite values with relevant ones for the extension converter */ 1434 mbcsTable->baseSharedData=baseSharedData; 1435 mbcsTable->extIndexes=extIndexes; 1436 1437 /* 1438 * It would be possible to share the swapLFNL data with a base converter, 1439 * but the generated name would have to be different, and the memory 1440 * would have to be free'd only once. 1441 * It is easier to just create the data for the extension converter 1442 * separately when it is requested. 1443 */ 1444 mbcsTable->swapLFNLStateTable=NULL; 1445 mbcsTable->swapLFNLFromUnicodeBytes=NULL; 1446 mbcsTable->swapLFNLName=NULL; 1447 1448 /* 1449 * The reconstitutedData must be deleted only when the base converter 1450 * is unloaded. 1451 */ 1452 mbcsTable->reconstitutedData=NULL; 1453 1454 /* 1455 * Set a special, runtime-only outputType if the extension converter 1456 * is a DBCS version of a base converter that also maps single bytes. 
1457 */ 1458 if( sharedData->staticData->conversionType==UCNV_DBCS || 1459 (sharedData->staticData->conversionType==UCNV_MBCS && 1460 sharedData->staticData->minBytesPerChar>=2) 1461 ) { 1462 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) { 1463 /* the base converter is SI/SO-stateful */ 1464 int32_t entry; 1465 1466 /* get the dbcs state from the state table entry for SO=0x0e */ 1467 entry=mbcsTable->stateTable[0][0xe]; 1468 if( MBCS_ENTRY_IS_FINAL(entry) && 1469 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && 1470 MBCS_ENTRY_FINAL_STATE(entry)!=0 1471 ) { 1472 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1473 1474 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1475 } 1476 } else if( 1477 baseSharedData->staticData->conversionType==UCNV_MBCS && 1478 baseSharedData->staticData->minBytesPerChar==1 && 1479 baseSharedData->staticData->maxBytesPerChar==2 && 1480 mbcsTable->countStates<=127 1481 ) { 1482 /* non-stateful base converter, need to modify the state table */ 1483 int32_t (*newStateTable)[256]; 1484 int32_t *state; 1485 int32_t i, count; 1486 1487 /* allocate a new state table and copy the base state table contents */ 1488 count=mbcsTable->countStates; 1489 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024); 1490 if(newStateTable==NULL) { 1491 ucnv_unload(baseSharedData); 1492 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1493 return; 1494 } 1495 1496 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024); 1497 1498 /* change all final single-byte entries to go to a new all-illegal state */ 1499 state=newStateTable[0]; 1500 for(i=0; i<256; ++i) { 1501 if(MBCS_ENTRY_IS_FINAL(state[i])) { 1502 state[i]=MBCS_ENTRY_TRANSITION(count, 0); 1503 } 1504 } 1505 1506 /* build the new all-illegal state */ 1507 state=newStateTable[count]; 1508 for(i=0; i<256; ++i) { 1509 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 1510 } 1511 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable; 1512 
mbcsTable->countStates=(uint8_t)(count+1); 1513 mbcsTable->stateTableOwned=TRUE; 1514 1515 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1516 } 1517 } 1518 1519 /* 1520 * unlike below for files with base tables, do not get the unicodeMask 1521 * from the sharedData; instead, use the base table's unicodeMask, 1522 * which we copied in the memcpy above; 1523 * this is necessary because the static data unicodeMask, especially 1524 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 1525 */ 1526 } else { 1527 /* conversion file with a base table; an additional extension table is optional */ 1528 /* make sure that the output type is known */ 1529 switch(mbcsTable->outputType) { 1530 case MBCS_OUTPUT_1: 1531 case MBCS_OUTPUT_2: 1532 case MBCS_OUTPUT_3: 1533 case MBCS_OUTPUT_4: 1534 case MBCS_OUTPUT_3_EUC: 1535 case MBCS_OUTPUT_4_EUC: 1536 case MBCS_OUTPUT_2_SISO: 1537 /* OK */ 1538 break; 1539 default: 1540 *pErrorCode=U_INVALID_TABLE_FORMAT; 1541 return; 1542 } 1543 if(pArgs->onlyTestIsLoadable) { 1544 /* 1545 * Exit as soon as we know that we can load the converter 1546 * and the format is valid and supported. 1547 * The worst that can happen in the following code is a memory 1548 * allocation error. 
1549 */ 1550 return; 1551 } 1552 1553 mbcsTable->countStates=(uint8_t)header->countStates; 1554 mbcsTable->countToUFallbacks=header->countToUFallbacks; 1555 mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4); 1556 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates); 1557 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits); 1558 1559 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable); 1560 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes); 1561 mbcsTable->fromUBytesLength=header->fromUBytesLength; 1562 1563 /* 1564 * converter versions 6.1 and up contain a unicodeMask that is 1565 * used here to select the most efficient function implementations 1566 */ 1567 info.size=sizeof(UDataInfo); 1568 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 1569 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { 1570 /* mask off possible future extensions to be safe */ 1571 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3); 1572 } else { 1573 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 1574 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; 1575 } 1576 1577 /* 1578 * _MBCSHeader.version 4.3 adds utf8Friendly data structures. 1579 * Check for the header version, SBCS vs. MBCS, and for whether the 1580 * data structures are optimized for code points as high as what the 1581 * runtime code is designed for. 1582 * The implementation does not handle mapping tables with entries for 1583 * unpaired surrogates. 1584 */ 1585 if( header->version[1]>=3 && 1586 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 && 1587 (mbcsTable->countStates==1 ? 
1588 (header->version[2]>=(SBCS_FAST_MAX>>8)) : 1589 (header->version[2]>=(MBCS_FAST_MAX>>8)) 1590 ) 1591 ) { 1592 mbcsTable->utf8Friendly=TRUE; 1593 1594 if(mbcsTable->countStates==1) { 1595 /* 1596 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 1597 * Build a table with indexes to each block, to be used instead of 1598 * the regular stage 1/2 table. 1599 */ 1600 int32_t i; 1601 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) { 1602 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 1603 } 1604 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */ 1605 mbcsTable->maxFastUChar=SBCS_FAST_MAX; 1606 } else { 1607 /* 1608 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 1609 * The .cnv file is prebuilt with an additional stage table with indexes 1610 * to each block. 1611 */ 1612 mbcsTable->mbcsIndex=(const uint16_t *) 1613 (mbcsTable->fromUnicodeBytes+ 1614 (noFromU ? 0 : mbcsTable->fromUBytesLength)); 1615 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff; 1616 } 1617 } 1618 1619 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 1620 { 1621 uint32_t asciiRoundtrips=0xffffffff; 1622 int32_t i; 1623 1624 for(i=0; i<0x80; ++i) { 1625 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 1626 asciiRoundtrips&=~((uint32_t)1<<(i>>2)); 1627 } 1628 } 1629 mbcsTable->asciiRoundtrips=asciiRoundtrips; 1630 } 1631 1632 if(noFromU) { 1633 uint32_t stage1Length= 1634 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ? 
1635 0x440 : 0x40; 1636 uint32_t stage2Length= 1637 (header->offsetFromUBytes-header->offsetFromUTable)/4- 1638 stage1Length/2; 1639 reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode); 1640 } 1641 } 1642 1643 /* Set the impl pointer here so that it is set for both extension-only and base tables. */ 1644 if(mbcsTable->utf8Friendly) { 1645 if(mbcsTable->countStates==1) { 1646 sharedData->impl=&_SBCSUTF8Impl; 1647 } else { 1648 if(mbcsTable->outputType==MBCS_OUTPUT_2) { 1649 sharedData->impl=&_DBCSUTF8Impl; 1650 } 1651 } 1652 } 1653 1654 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) { 1655 /* 1656 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. 1657 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. 1658 */ 1659 mbcsTable->asciiRoundtrips=0; 1660 } 1661} 1662 1663static void 1664ucnv_MBCSUnload(UConverterSharedData *sharedData) { 1665 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1666 1667 if(mbcsTable->swapLFNLStateTable!=NULL) { 1668 uprv_free(mbcsTable->swapLFNLStateTable); 1669 } 1670 if(mbcsTable->stateTableOwned) { 1671 uprv_free((void *)mbcsTable->stateTable); 1672 } 1673 if(mbcsTable->baseSharedData!=NULL) { 1674 ucnv_unload(mbcsTable->baseSharedData); 1675 } 1676 if(mbcsTable->reconstitutedData!=NULL) { 1677 uprv_free(mbcsTable->reconstitutedData); 1678 } 1679} 1680 1681static void 1682ucnv_MBCSOpen(UConverter *cnv, 1683 UConverterLoadArgs *pArgs, 1684 UErrorCode *pErrorCode) { 1685 UConverterMBCSTable *mbcsTable; 1686 const int32_t *extIndexes; 1687 uint8_t outputType; 1688 int8_t maxBytesPerUChar; 1689 1690 if(pArgs->onlyTestIsLoadable) { 1691 return; 1692 } 1693 1694 mbcsTable=&cnv->sharedData->mbcs; 1695 outputType=mbcsTable->outputType; 1696 1697 if(outputType==MBCS_OUTPUT_DBCS_ONLY) { 1698 /* the swaplfnl option does not apply, remove it */ 1699 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 
1700 } 1701 1702 if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1703 /* do this because double-checked locking is broken */ 1704 UBool isCached; 1705 1706 umtx_lock(NULL); 1707 isCached=mbcsTable->swapLFNLStateTable!=NULL; 1708 umtx_unlock(NULL); 1709 1710 if(!isCached) { 1711 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { 1712 if(U_FAILURE(*pErrorCode)) { 1713 return; /* something went wrong */ 1714 } 1715 1716 /* the option does not apply, remove it */ 1717 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1718 } 1719 } 1720 } 1721 1722 if(uprv_strstr(pArgs->name, "18030")!=NULL) { 1723 if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) { 1724 /* set a flag for GB 18030 mode, which changes the callback behavior */ 1725 cnv->options|=_MBCS_OPTION_GB18030; 1726 } 1727 } 1728 1729 /* fix maxBytesPerUChar depending on outputType and options etc. */ 1730 if(outputType==MBCS_OUTPUT_2_SISO) { 1731 cnv->maxBytesPerUChar=3; /* SO+DBCS */ 1732 } 1733 1734 extIndexes=mbcsTable->extIndexes; 1735 if(extIndexes!=NULL) { 1736 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes); 1737 if(outputType==MBCS_OUTPUT_2_SISO) { 1738 ++maxBytesPerUChar; /* SO + multiple DBCS */ 1739 } 1740 1741 if(maxBytesPerUChar>cnv->maxBytesPerUChar) { 1742 cnv->maxBytesPerUChar=maxBytesPerUChar; 1743 } 1744 } 1745 1746#if 0 1747 /* 1748 * documentation of UConverter fields used for status 1749 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset() 1750 */ 1751 1752 /* toUnicode */ 1753 cnv->toUnicodeStatus=0; /* offset */ 1754 cnv->mode=0; /* state */ 1755 cnv->toULength=0; /* byteIndex */ 1756 1757 /* fromUnicode */ 1758 cnv->fromUChar32=0; 1759 cnv->fromUnicodeStatus=1; /* prevLength */ 1760#endif 1761} 1762 1763static const char * 1764ucnv_MBCSGetName(const UConverter *cnv) { 1765 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) { 1766 return cnv->sharedData->mbcs.swapLFNLName; 1767 
} else {
        return cnv->sharedData->staticData->name;
    }
}

/* MBCS-to-Unicode conversion functions ------------------------------------- */

/*
 * Look up the toUnicode fallback mapping for a result offset.
 *
 * Binary-searches the mbcsTable->toUFallbacks array (countToUFallbacks entries)
 * for an entry whose .offset equals the given offset.
 * NOTE(review): the search presumes that the entries are sorted by ascending
 * offset -- confirm against the code that builds/loads the table.
 *
 * Returns the matching entry's codePoint, or 0xfffe if there is no entry.
 */
static UChar32
ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
    const _MBCSToUFallback *toUFallbacks;
    uint32_t i, start, limit;

    limit=mbcsTable->countToUFallbacks;
    if(limit>0) {
        /* do a binary search for the fallback mapping */
        toUFallbacks=mbcsTable->toUFallbacks;
        start=0;
        /* narrow [start, limit) down to a single candidate at index start */
        while(start<limit-1) {
            i=(start+limit)/2;
            if(offset<toUFallbacks[i].offset) {
                limit=i;
            } else {
                start=i;
            }
        }

        /* did we really find it? */
        if(offset==toUFallbacks[start].offset) {
            return toUFallbacks[start].codePoint;
        }
    }

    /* no fallback mapping for this offset */
    return 0xfffe;
}

/* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
/*
 * Every source byte is looked up in row 0 of the state table and the entry is
 * treated as a final entry.
 * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the target is full
 * (a pending trail surrogate is stored in cnv->UCharErrorBuffer),
 * to U_ILLEGAL_CHAR_FOUND for an illegal byte, or to whatever _extToU() sets
 * for a byte without a direct mapping.
 * The advanced source/target/offsets pointers are written back to pArgs.
 */
static void
ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                                UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    const int32_t (*stateTable)[256];

    int32_t sourceIndex;

    int32_t entry;
    UChar c;
    uint8_t action;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;

    /* with the swap-LF/NL option, use the alternate state table stored in the shared data */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=0;

    /* conversion loop */
    while(source<sourceLimit) {
        /*
         * This following test is to see if available input would overflow the output.
         * It does not catch output of more than one code unit that
         * overflows as a result of a surrogate pair or callback output
         * from the last source byte.
         * Therefore, those situations also test for overflows and will
         * then break the loop, too.
         */
        if(target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }

        entry=stateTable[0][*source++];
        /* MBCS_ENTRY_IS_FINAL(entry) */

        /* test the most common case first */
        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
            /* output BMP code point */
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }

            /* normal end of action codes: prepare for a new character */
            ++sourceIndex;
            continue;
        }

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_VALID_DIRECT_20 ||
           (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
        ) {
            /* the entry value is split into lead (upper bits) and trail (lower 10 bits) surrogates */
            entry=MBCS_ENTRY_FINAL_VALUE(entry);
            /* output surrogate pair */
            *target++=(UChar)(0xd800|(UChar)(entry>>10));
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }
            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
            if(target<targetLimit) {
                *target++=c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
            } else {
                /* target overflow */
                /* keep the trail surrogate in the converter's overflow buffer */
                cnv->UCharErrorBuffer[0]=c;
                cnv->UCharErrorBufferLength=1;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }

            ++sourceIndex;
            continue;
        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }

                ++sourceIndex;
                continue;
            }
            /* fallback not used: continue below like an unassigned byte */
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        } else {
            /* reserved, must never occur */
            /* the byte is consumed without producing any output */
            ++sourceIndex;
            continue;
        }

        if(U_FAILURE(*pErrorCode)) {
            /* callback(illegal) */
            break;
        } else /* unassigned sequences indicated with byteIndex>0 */ {
            /* try an extension mapping */
            /* remember the current position so that bytes consumed by _extToU() can be counted */
            pArgs->source=(const char *)source;
            cnv->toUBytes[0]=*(source-1);
            cnv->toULength=_extToU(cnv, cnv->sharedData,
                                    1, &source, sourceLimit,
                                    &target, targetLimit,
                                    &offsets, sourceIndex,
                                    pArgs->flush,
                                    pErrorCode);
            /* 1 for the byte read above, plus whatever _extToU() consumed */
            sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);

            if(U_FAILURE(*pErrorCode)) {
                /* not mappable or buffer overflow */
                break;
            }
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}

/*
 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
 * that only map to and from the BMP.
 * In addition to single-byte optimizations, the offset calculations
 * become much easier.
 */
/*
 * Offsets are not written per character in the fast paths; they are filled in
 * afterwards for all bytes consumed since lastSource.
 * Sets *pErrorCode to U_ILLEGAL_CHAR_FOUND for an illegal byte,
 * to U_BUFFER_OVERFLOW_ERROR when input remains but the target is full,
 * or to whatever _extToU() sets for a byte without a direct mapping.
 */
static void
ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit, *lastSource;
    UChar *target;
    int32_t targetCapacity, length;
    int32_t *offsets;

    const int32_t (*stateTable)[256];

    int32_t sourceIndex;

    int32_t entry;
    uint8_t action;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* with the swap-LF/NL option, use the alternate state table stored in the shared data */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=0;
    /* lastSource marks the first byte for which offsets have not been written yet */
    lastSource=source;

    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length=(int32_t)(sourceLimit-source);
    if(length<targetCapacity) {
        targetCapacity=length;
    }

#if MBCS_UNROLL_SINGLE_TO_BMP
    /* unrolling makes it faster on Pentium III/Windows 2000 */
    /* unroll the loop with the most common case */
unrolled:
    if(targetCapacity>=16) {
        int32_t count, loops, oredEntries;

        /* number of complete groups of 16 that fit */
        loops=count=targetCapacity>>4;
        do {
            /*
             * Convert 16 bytes without testing each entry;
             * the entries are OR'ed together and tested once per group below.
             */
            oredEntries=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

            /* were all 16 entries really valid? */
            if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
                /* no, return to the first of these 16 */
                source-=16;
                target-=16;
                break;
            }
        } while(--count>0);
        /* count becomes the number of groups of 16 that were fully converted */
        count=loops-count;
        targetCapacity-=16*count;

        if(offsets!=NULL) {
            /* write the offsets for the converted groups; 1:1, so they just count up */
            lastSource+=16*count;
            while(count>0) {
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                --count;
            }
        }
    }
#endif

    /* conversion loop */
    while(targetCapacity>0) {
        entry=stateTable[0][*source++];
        /* MBCS_ENTRY_IS_FINAL(entry) */

        /* test the most common case first */
        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
            /* output BMP code point */
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            --targetCapacity;
            continue;
        }

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                --targetCapacity;
                continue;
            }
            /* fallback not used: continue below like an unassigned byte */
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        } else {
            /* reserved, must never occur */
            /*
             * NOTE(review): the byte is consumed here without output and without
             * decrementing targetCapacity, so the single 1:1 counter (and the
             * lastSource-based offsets) no longer match the remaining input
             * if this branch were ever taken -- confirm that it really cannot occur.
             */
            continue;
        }

        /* set offsets since the start or the last extension */
        if(offsets!=NULL) {
            int32_t count=(int32_t)(source-lastSource);

            /* predecrement: do not set the offset for the callback-causing character */
            while(--count>0) {
                *offsets++=sourceIndex++;
            }
            /* offset and sourceIndex are now set for the current character */
        }

        if(U_FAILURE(*pErrorCode)) {
            /* callback(illegal) */
            break;
        } else /* unassigned sequences indicated with byteIndex>0 */ {
            /* try an extension mapping */
            lastSource=source;
            cnv->toUBytes[0]=*(source-1);
            cnv->toULength=_extToU(cnv, cnv->sharedData,
                                    1, &source, sourceLimit,
                                    &target, pArgs->targetLimit,
                                    &offsets, sourceIndex,
                                    pArgs->flush,
                                    pErrorCode);
            /* 1 for the byte read above, plus whatever _extToU() consumed */
            sourceIndex+=1+(int32_t)(source-lastSource);

            if(U_FAILURE(*pErrorCode)) {
                /* not mappable or buffer overflow */
                break;
            }

            /* recalculate the targetCapacity after an extension mapping */
            targetCapacity=(int32_t)(pArgs->targetLimit-target);
            length=(int32_t)(sourceLimit-source);
            if(length<targetCapacity) {
                targetCapacity=length;
            }
        }

#if MBCS_UNROLL_SINGLE_TO_BMP
        /* unrolling makes it faster on Pentium III/Windows 2000 */
        goto unrolled;
#endif
    }

    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* set offsets since the start or the last callback */
    if(offsets!=NULL) {
        size_t count=source-lastSource;
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}

/*
 * Does the given state have at least one byte with a final entry whose action
 * is not MBCS_STATE_ILLEGAL, either directly or (recursively) in a state
 * reached via a transition entry?
 * Returns TRUE if so, FALSE otherwise.
 * NOTE(review): the recursion has no cycle guard; presumably state tables
 * never contain transition cycles -- confirm.
 */
static UBool
hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
    const int32_t *row=stateTable[state];
    int32_t b, entry;
    /* First test for final entries in this state for some commonly valid byte values. */
    entry=row[0xa1];
    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
    ) {
        return TRUE;
    }
    entry=row[0x41];
    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
    ) {
        return TRUE;
    }
    /* Then test for final entries in this state. */
    for(b=0; b<=0xff; ++b) {
        entry=row[b];
        if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
            MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
        ) {
            return TRUE;
        }
    }
    /* Then recurse for transition entries. */
    for(b=0; b<=0xff; ++b) {
        entry=row[b];
        if( MBCS_ENTRY_IS_TRANSITION(entry) &&
            hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
        ) {
            return TRUE;
        }
    }
    return FALSE;
}

/*
 * Is byte b a single/lead byte in this state?
 * Recurse for transition states, because here we don't want to say that
 * b is a lead byte if all byte sequences that start with b are illegal.
*/
static UBool
isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
    const int32_t *row=stateTable[state];
    int32_t entry=row[b];
    if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */
        /* only a real lead byte if some byte sequence continuing from it is not illegal */
        return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
    } else {
        uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
            return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
        } else {
            /* any other final entry counts as a single byte unless it is illegal */
            return action!=MBCS_STATE_ILLEGAL;
        }
    }
}

/*
 * Convert bytes from pArgs->source to UTF-16 in pArgs->target using the MBCS
 * state table, optionally recording a source offset per output unit in
 * pArgs->offsets.
 *
 * - A pending extension match (cnv->preToULength>0) is continued first.
 * - Converters with a single state are handed to the
 *   ucnv_MBCSSingle...WithOffsets() functions.
 * - Otherwise the state table is walked byte by byte; the partial-character
 *   state is loaded from and stored back into cnv->toUnicodeStatus (offset),
 *   cnv->mode (state) and cnv->toUBytes/cnv->toULength (bytes so far).
 *
 * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the target is full,
 * to U_ILLEGAL_CHAR_FOUND for an illegal byte sequence, or to whatever
 * _extToU() sets for a sequence without a direct mapping.
 * The advanced source/target/offsets pointers are written back to pArgs.
 */
U_CFUNC void
ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                          UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    const int32_t (*stateTable)[256];
    const uint16_t *unicodeCodeUnits;

    uint32_t offset;
    uint8_t state;
    int8_t byteIndex;
    uint8_t *bytes;

    int32_t sourceIndex, nextSourceIndex;

    int32_t entry;
    UChar c;
    uint8_t action;

    /* use optimized function if possible */
    cnv=pArgs->converter;

    if(cnv->preToULength>0) {
        /*
         * pass sourceIndex=-1 because we continue from an earlier buffer
         * in the future, this may change with continuous offsets
         */
        ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);

        /* stop on an error, or when preToULength has become negative */
        if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
            return;
        }
    }

    if(cnv->sharedData->mbcs.countStates==1) {
        /* single-state codepage: use the BMP-only version unless the mask has UCNV_HAS_SUPPLEMENTARY */
        if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
            ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
        } else {
            ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
        }
        return;
    }

    /* set up the local pointers */
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;

    /* with the swap-LF/NL option, use the alternate state table stored in the shared data */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }
    unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;

    /* get the converter state from UConverter */
    offset=cnv->toUnicodeStatus;
    byteIndex=cnv->toULength;
    bytes=cnv->toUBytes;

    /*
     * if we are in the SBCS state for a DBCS-only converter,
     * then load the DBCS state from the MBCS data
     * (dbcsOnlyState==0 if it is not a DBCS-only converter)
     */
    if((state=(uint8_t)(cnv->mode))==0) {
        state=cnv->sharedData->mbcs.dbcsOnlyState;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    while(source<sourceLimit) {
        /*
         * This following test is to see if available input would overflow the output.
         * It does not catch output of more than one code unit that
         * overflows as a result of a surrogate pair or callback output
         * from the last source byte.
         * Therefore, those situations also test for overflows and will
         * then break the loop, too.
         */
        if(target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }

        if(byteIndex==0) {
            /* optimized loop for 1/2-byte input and BMP output */
            /*
             * Both variants below leave the loop with entry holding the
             * state table entry for *source, which has not been consumed yet.
             */
            if(offsets==NULL) {
                do {
                    entry=stateTable[state][*source];
                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);

                        ++source;
                        /* try to finish a 2-byte character right here */
                        if( source<sourceLimit &&
                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
                        ) {
                            ++source;
                            *target++=c;
                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
                            offset=0;
                        } else {
                            /* set the state and leave the optimized loop */
                            bytes[0]=*(source-1);
                            byteIndex=1;
                            break;
                        }
                    } else {
                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
                            /* output BMP code point */
                            ++source;
                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
                        } else {
                            /* leave the optimized loop */
                            break;
                        }
                    }
                } while(source<sourceLimit && target<targetLimit);
            } else /* offsets!=NULL */ {
                /* same loop as above, additionally maintaining offsets and the source indexes */
                do {
                    entry=stateTable[state][*source];
                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);

                        ++source;
                        /* try to finish a 2-byte character right here */
                        if( source<sourceLimit &&
                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
                        ) {
                            ++source;
                            *target++=c;
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex;
                                /* two bytes were consumed for this character */
                                sourceIndex=(nextSourceIndex+=2);
                            }
                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
                            offset=0;
                        } else {
                            /* set the state and leave the optimized loop */
                            ++nextSourceIndex;
                            bytes[0]=*(source-1);
                            byteIndex=1;
                            break;
                        }
                    } else {
                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
                            /* output BMP code point */
                            ++source;
                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex;
                                sourceIndex=++nextSourceIndex;
                            }
                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
                        } else {
                            /* leave the optimized loop */
                            break;
                        }
                    }
                } while(source<sourceLimit && target<targetLimit);
            }

            /*
             * these tests and break statements could be put inside the loop
             * if C had "break outerLoop" like Java
             */
            if(source>=sourceLimit) {
                break;
            }
            if(target>=targetLimit) {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }

            /* consume the byte whose entry was already fetched in the optimized loop */
            ++nextSourceIndex;
            bytes[byteIndex++]=*source++;
        } else /* byteIndex>0 */ {
            /* continue a partial character: fetch the entry and record the byte */
            ++nextSourceIndex;
            entry=stateTable[state][bytes[byteIndex++]=*source++];
        }

        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
            /* more bytes are needed: move to the next state and accumulate the offset */
            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
            continue;
        }

        /* save the previous state for proper extension mapping with SI/SO-stateful converters */
        cnv->mode=state;

        /* set the next state early so that we can reuse the entry variable */
        state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         */
        /*
         * In the branches below, byteIndex=0 marks the byte sequence as handled;
         * a byteIndex that is still >0 afterwards means illegal (error code set)
         * or unassigned (no error code).
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_VALID_16) {
            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
            c=unicodeCodeUnits[offset];
            if(c<0xfffe) {
                /* output BMP code point */
                *target++=c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
            } else if(c==0xfffe) {
                /* 0xfffe: no roundtrip mapping; look for a fallback if fallbacks are used */
                if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
                    /* output fallback BMP code point */
                    *target++=(UChar)entry;
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                    }
                    byteIndex=0;
                }
            } else {
                /* callback(illegal) */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            }
        } else if(action==MBCS_STATE_VALID_DIRECT_16) {
            /* output BMP code point */
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }
            byteIndex=0;
        } else if(action==MBCS_STATE_VALID_16_PAIR) {
            /* two code units are stored; the first one also tells how to interpret the pair */
            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
            c=unicodeCodeUnits[offset++];
            if(c<0xd800) {
                /* output BMP code point below 0xd800 */
                *target++=c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
                /* output roundtrip or fallback surrogate pair */
                /* c&0xdbff turns a 0xdc00..0xdfff marker into a lead surrogate */
                *target++=(UChar)(c&0xdbff);
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
                if(target<targetLimit) {
                    *target++=unicodeCodeUnits[offset];
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                    }
                } else {
                    /* target overflow */
                    /* keep the second code unit in the converter's overflow buffer */
                    cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
                    cnv->UCharErrorBufferLength=1;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                    offset=0;
                    break;
                }
            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
                /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
                *target++=unicodeCodeUnits[offset];
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
            } else if(c==0xffff) {
                /* callback(illegal) */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            }
        } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
                  (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
        ) {
            /* the entry value is split into lead (upper bits) and trail (lower 10 bits) surrogates */
            entry=MBCS_ENTRY_FINAL_VALUE(entry);
            /* output surrogate pair */
            *target++=(UChar)(0xd800|(UChar)(entry>>10));
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }
            byteIndex=0;
            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
            if(target<targetLimit) {
                *target++=c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
            } else {
                /* target overflow */
                /* keep the trail surrogate in the converter's overflow buffer */
                cnv->UCharErrorBuffer[0]=c;
                cnv->UCharErrorBufferLength=1;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                offset=0;
                break;
            }
        } else if(action==MBCS_STATE_CHANGE_ONLY) {
            /*
             * This serves as a state change without any output.
             * It is useful for reading simple stateful encodings,
             * for example using just Shift-In/Shift-Out codes.
             * The 21 unused bits may later be used for more sophisticated
             * state transitions.
             */
            if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
                byteIndex=0;
            } else {
                /* SI/SO are illegal for DBCS-only conversion */
                state=(uint8_t)(cnv->mode); /* restore the previous state */

                /* callback(illegal) */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            }
        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
            }
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        } else {
            /* reserved, must never occur */
            byteIndex=0;
        }

        /* end of action codes: prepare for a new character */
        offset=0;

        if(byteIndex==0) {
            /* the byte sequence was handled; the next character starts at the next byte */
            sourceIndex=nextSourceIndex;
        } else if(U_FAILURE(*pErrorCode)) {
            /* callback(illegal) */
            if(byteIndex>1) {
                /*
                 * Ticket 5691: consistent illegal sequences:
                 * - We include at least the first byte in the illegal sequence.
                 * - If any of the non-initial bytes could be the start of a character,
                 *   we stop the illegal sequence before the first one of those.
                 */
                UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
                int8_t i;
                /* find the first non-initial byte that could start a character */
                for(i=1;
                    i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
                    ++i) {}
                if(i<byteIndex) {
                    /* Back out some bytes. */
                    int8_t backOutDistance=byteIndex-i;
                    int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
                    byteIndex=i;  /* length of reported illegal byte sequence */
                    if(backOutDistance<=bytesFromThisBuffer) {
                        /* all backed-out bytes are in this buffer: just rewind the source pointer */
                        source-=backOutDistance;
                    } else {
                        /* Back out bytes from the previous buffer: Need to replay them. */
                        cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                        /* preToULength is negative! */
                        uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
                        source=(const uint8_t *)pArgs->source;
                    }
                }
            }
            break;
        } else /* unassigned sequences indicated with byteIndex>0 */ {
            /* try an extension mapping */
            /* remember the current position so that bytes consumed by _extToU() can be counted */
            pArgs->source=(const char *)source;
            byteIndex=_extToU(cnv, cnv->sharedData,
                              byteIndex, &source, sourceLimit,
                              &target, targetLimit,
                              &offsets, sourceIndex,
                              pArgs->flush,
                              pErrorCode);
            sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);

            if(U_FAILURE(*pErrorCode)) {
                /* not mappable or buffer overflow */
                break;
            }
        }
    }

    /* set the converter state back into UConverter */
    cnv->toUnicodeStatus=offset;
    cnv->mode=state;
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}

/*
 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
2651 */ 2652static UChar32 2653ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 2654 UErrorCode *pErrorCode) { 2655 UConverter *cnv; 2656 const int32_t (*stateTable)[256]; 2657 const uint8_t *source, *sourceLimit; 2658 2659 int32_t entry; 2660 uint8_t action; 2661 2662 /* set up the local pointers */ 2663 cnv=pArgs->converter; 2664 source=(const uint8_t *)pArgs->source; 2665 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2666 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2667 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2668 } else { 2669 stateTable=cnv->sharedData->mbcs.stateTable; 2670 } 2671 2672 /* conversion loop */ 2673 while(source<sourceLimit) { 2674 entry=stateTable[0][*source++]; 2675 /* MBCS_ENTRY_IS_FINAL(entry) */ 2676 2677 /* write back the updated pointer early so that we can return directly */ 2678 pArgs->source=(const char *)source; 2679 2680 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2681 /* output BMP code point */ 2682 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2683 } 2684 2685 /* 2686 * An if-else-if chain provides more reliable performance for 2687 * the most common cases compared to a switch. 
2688 */ 2689 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2690 if( action==MBCS_STATE_VALID_DIRECT_20 || 2691 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2692 ) { 2693 /* output supplementary code point */ 2694 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2695 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2696 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2697 /* output BMP code point */ 2698 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2699 } 2700 } else if(action==MBCS_STATE_UNASSIGNED) { 2701 /* just fall through */ 2702 } else if(action==MBCS_STATE_ILLEGAL) { 2703 /* callback(illegal) */ 2704 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2705 } else { 2706 /* reserved, must never occur */ 2707 continue; 2708 } 2709 2710 if(U_FAILURE(*pErrorCode)) { 2711 /* callback(illegal) */ 2712 break; 2713 } else /* unassigned sequence */ { 2714 /* defer to the generic implementation */ 2715 pArgs->source=(const char *)source-1; 2716 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2717 } 2718 } 2719 2720 /* no output because of empty input or only state changes */ 2721 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2722 return 0xffff; 2723} 2724 2725/* 2726 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 2727 * conversion without offset handling. 2728 * 2729 * When a character does not have a mapping to Unicode, then we return to the 2730 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 2731 * handling. 2732 * We also defer to the generic code in other complicated cases and have them 2733 * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 2734 * 2735 * All normal mappings and errors are handled here. 
*/
/*
 * Returns the next code point, or UCNV_GET_NEXT_UCHAR_USE_TO_U when the
 * generic code should do the conversion, or 0xffff together with an error
 * code in *pErrorCode:
 * - U_TRUNCATED_CHAR_FOUND: the input ended inside a character;
 *   its bytes are stored in cnv->toUBytes/cnv->toULength
 * - U_ILLEGAL_CHAR_FOUND: illegal byte sequence
 * - U_INDEX_OUTOFBOUNDS_ERROR: no output because of empty input or only state changes
 */
static UChar32
ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
                  UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit, *lastSource;

    const int32_t (*stateTable)[256];
    const uint16_t *unicodeCodeUnits;

    uint32_t offset;
    uint8_t state;

    int32_t entry;
    UChar32 c;
    uint8_t action;

    /* use optimized function if possible */
    cnv=pArgs->converter;

    if(cnv->preToULength>0) {
        /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
    }

    if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
        /*
         * Using the generic ucnv_getNextUChar() code lets us deal correctly
         * with the rare case of a codepage that maps single surrogates
         * without adding the complexity to this already complicated function here.
         */
        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
    } else if(cnv->sharedData->mbcs.countStates==1) {
        return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
    }

    /* set up the local pointers */
    /* lastSource marks the first byte of the character that is currently being read */
    source=lastSource=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;

    /* with the swap-LF/NL option, use the alternate state table stored in the shared data */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }
    unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;

    /* get the converter state from UConverter */
    offset=cnv->toUnicodeStatus;

    /*
     * if we are in the SBCS state for a DBCS-only converter,
     * then load the DBCS state from the MBCS data
     * (dbcsOnlyState==0 if it is not a DBCS-only converter)
     */
    if((state=(uint8_t)(cnv->mode))==0) {
        state=cnv->sharedData->mbcs.dbcsOnlyState;
    }

    /* conversion loop */
    /* c stays U_SENTINEL until a code unit is read from the mapping data or an output is found */
    c=U_SENTINEL;
    while(source<sourceLimit) {
        entry=stateTable[state][*source++];
        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);

            /* optimization for 1/2-byte input and BMP output */
            if( source<sourceLimit &&
                MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
                MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
                (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
            ) {
                ++source;
                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
                /* output BMP code point */
                break;
            }
            /* otherwise the byte at *source is read again by the next loop iteration */
        } else {
            /* save the previous state for proper extension mapping with SI/SO-stateful converters */
            cnv->mode=state;

            /* set the next state early so that we can reuse the entry variable */
            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */

            /*
             * An if-else-if chain provides more reliable performance for
             * the most common cases compared to a switch.
             */
            /* a break below leaves the loop with the output code point in c */
            action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
            if(action==MBCS_STATE_VALID_DIRECT_16) {
                /* output BMP code point */
                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                break;
            } else if(action==MBCS_STATE_VALID_16) {
                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
                c=unicodeCodeUnits[offset];
                if(c<0xfffe) {
                    /* output BMP code point */
                    break;
                } else if(c==0xfffe) {
                    /* 0xfffe: no roundtrip mapping; look for a fallback if fallbacks are used */
                    if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
                        break;
                    }
                } else {
                    /* callback(illegal) */
                    /*
                     * NOTE(review): c is 0xffff (not negative) here, so the
                     * if(c<0) block after the loop, which records the illegal
                     * bytes, is skipped for this path -- confirm that this is intended.
                     */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                }
            } else if(action==MBCS_STATE_VALID_16_PAIR) {
                /* two code units are stored; the first one also tells how to interpret the pair */
                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
                c=unicodeCodeUnits[offset++];
                if(c<0xd800) {
                    /* output BMP code point below 0xd800 */
                    break;
                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
                    /* output roundtrip or fallback supplementary code point */
                    /* combine the low 10 bits of the first unit with the trail surrogate that follows */
                    c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
                    break;
                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
                    c=unicodeCodeUnits[offset];
                    break;
                } else if(c==0xffff) {
                    /* callback(illegal) */
                    /* NOTE(review): as above, c is not negative on this path -- confirm */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                }
            } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
                      (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
            ) {
                /* output supplementary code point */
                c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
                break;
            } else if(action==MBCS_STATE_CHANGE_ONLY) {
                /*
                 * This serves as a state change without any output.
                 * It is useful for reading simple stateful encodings,
                 * for example using just Shift-In/Shift-Out codes.
                 * The 21 unused bits may later be used for more sophisticated
                 * state transitions.
                 */
                if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
                    /* SI/SO are illegal for DBCS-only conversion */
                    state=(uint8_t)(cnv->mode); /* restore the previous state */

                    /* callback(illegal) */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                }
                /*
                 * NOTE(review): when dbcsOnlyState==0, no error is set and control
                 * continues after the if-else-if chain into the "unassigned sequence"
                 * handling, not into the final else branch below whose comment
                 * mentions "only state change" -- confirm which one is intended.
                 */
            } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
                if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                    /* output BMP code point */
                    c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                    break;
                }
            } else if(action==MBCS_STATE_UNASSIGNED) {
                /* just fall through */
            } else if(action==MBCS_STATE_ILLEGAL) {
                /* callback(illegal) */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            } else {
                /* reserved (must never occur), or only state change */
                /* skip these bytes: the next character starts at the current position */
                offset=0;
                lastSource=source;
                continue;
            }

            /* end of action codes: prepare for a new character */
            offset=0;

            if(U_FAILURE(*pErrorCode)) {
                /* callback(illegal) */
                break;
            } else /* unassigned sequence */ {
                /* defer to the generic implementation */
                /* reset to the start of this byte sequence so that the generic code converts it again */
                cnv->toUnicodeStatus=0;
                cnv->mode=state;
                pArgs->source=(const char *)lastSource;
                return UCNV_GET_NEXT_UCHAR_USE_TO_U;
            }
        }
    }

    if(c<0) {
        /* c was not changed from U_SENTINEL: there is no code point to return */
        if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
            /* incomplete character byte sequence */
            /* copy the bytes read so far into the converter for the error handling */
            uint8_t *bytes=cnv->toUBytes;
            cnv->toULength=(int8_t)(source-lastSource);
            do {
                *bytes++=*lastSource++;
            } while(lastSource<source);
            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
        } else if(U_FAILURE(*pErrorCode)) {
            /* callback(illegal) */
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             */
            UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
            uint8_t *bytes=cnv->toUBytes;
            *bytes++=*lastSource++;     /* first byte */
            if(lastSource==source) {
                cnv->toULength=1;
            } else /* lastSource<source: multi-byte character */ {
                int8_t i;
                /* copy further bytes until one of them could start a character */
                for(i=1;
                    lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
                    ++i
                ) {
                    *bytes++=*lastSource++;
                }
                cnv->toULength=i;
                /* continue after the reported bytes; any others are read again */
                source=lastSource;
            }
        } else {
            /* no output because of empty input or only state changes */
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        }
        c=0xffff;
    }

    /* set the converter state back into UConverter, ready for a new character */
    cnv->toUnicodeStatus=0;
    cnv->mode=state;

    /* write back the updated pointer */
    pArgs->source=(const char *)source;
    return c;
}

#if 0
/*
 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
 * Removal improves code coverage.
 */
/**
 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It does not handle conversion extensions (_extToU()).
 */
U_CFUNC UChar32
ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
                              uint8_t b, UBool useFallback) {
    int32_t entry;
    uint8_t action;

    entry=sharedData->mbcs.stateTable[0][b];
    /* MBCS_ENTRY_IS_FINAL(entry) */

    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
        /* output BMP code point */
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    }

    /*
     * An if-else-if chain provides more reliable performance for
     * the most common cases compared to a switch.
2993 */ 2994 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2995 if(action==MBCS_STATE_VALID_DIRECT_20) { 2996 /* output supplementary code point */ 2997 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 2998 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2999 if(!TO_U_USE_FALLBACK(useFallback)) { 3000 return 0xfffe; 3001 } 3002 /* output BMP code point */ 3003 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3004 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3005 if(!TO_U_USE_FALLBACK(useFallback)) { 3006 return 0xfffe; 3007 } 3008 /* output supplementary code point */ 3009 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3010 } else if(action==MBCS_STATE_UNASSIGNED) { 3011 return 0xfffe; 3012 } else if(action==MBCS_STATE_ILLEGAL) { 3013 return 0xffff; 3014 } else { 3015 /* reserved, must never occur */ 3016 return 0xffff; 3017 } 3018} 3019#endif 3020 3021/* 3022 * This is a simple version of _MBCSGetNextUChar() that is used 3023 * by other converter implementations. 3024 * It only returns an "assigned" result if it consumes the entire input. 3025 * It does not use state from the converter, nor error codes. 3026 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3027 * It handles conversion extensions but not GB 18030. 3028 * 3029 * Return value: 3030 * U+fffe unassigned 3031 * U+ffff illegal 3032 * otherwise the Unicode code point 3033 */ 3034U_CFUNC UChar32 3035ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 3036 const char *source, int32_t length, 3037 UBool useFallback) { 3038 const int32_t (*stateTable)[256]; 3039 const uint16_t *unicodeCodeUnits; 3040 3041 uint32_t offset; 3042 uint8_t state, action; 3043 3044 UChar32 c; 3045 int32_t i, entry; 3046 3047 if(length<=0) { 3048 /* no input at all: "illegal" */ 3049 return 0xffff; 3050 } 3051 3052#if 0 3053/* 3054 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. 
markus 3055 * TODO In future releases, verify that this function is never called for SBCS 3056 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 3057 * Removal improves code coverage. 3058 */ 3059 /* use optimized function if possible */ 3060 if(sharedData->mbcs.countStates==1) { 3061 if(length==1) { 3062 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 3063 } else { 3064 return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 3065 } 3066 } 3067#endif 3068 3069 /* set up the local pointers */ 3070 stateTable=sharedData->mbcs.stateTable; 3071 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 3072 3073 /* converter state */ 3074 offset=0; 3075 state=sharedData->mbcs.dbcsOnlyState; 3076 3077 /* conversion loop */ 3078 for(i=0;;) { 3079 entry=stateTable[state][(uint8_t)source[i++]]; 3080 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3081 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3082 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3083 3084 if(i==length) { 3085 return 0xffff; /* truncated character */ 3086 } 3087 } else { 3088 /* 3089 * An if-else-if chain provides more reliable performance for 3090 * the most common cases compared to a switch. 
3091 */ 3092 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3093 if(action==MBCS_STATE_VALID_16) { 3094 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3095 c=unicodeCodeUnits[offset]; 3096 if(c!=0xfffe) { 3097 /* done */ 3098 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3099 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 3100 /* else done with 0xfffe */ 3101 } 3102 break; 3103 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 3104 /* output BMP code point */ 3105 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3106 break; 3107 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3108 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3109 c=unicodeCodeUnits[offset++]; 3110 if(c<0xd800) { 3111 /* output BMP code point below 0xd800 */ 3112 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 3113 /* output roundtrip or fallback supplementary code point */ 3114 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 3115 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3116 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3117 c=unicodeCodeUnits[offset]; 3118 } else if(c==0xffff) { 3119 return 0xffff; 3120 } else { 3121 c=0xfffe; 3122 } 3123 break; 3124 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 3125 /* output supplementary code point */ 3126 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3127 break; 3128 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3129 if(!TO_U_USE_FALLBACK(useFallback)) { 3130 c=0xfffe; 3131 break; 3132 } 3133 /* output BMP code point */ 3134 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3135 break; 3136 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3137 if(!TO_U_USE_FALLBACK(useFallback)) { 3138 c=0xfffe; 3139 break; 3140 } 3141 /* output supplementary code point */ 3142 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3143 break; 3144 } else if(action==MBCS_STATE_UNASSIGNED) { 3145 c=0xfffe; 3146 break; 3147 } 3148 3149 /* 3150 * forbid MBCS_STATE_CHANGE_ONLY for this function, 3151 * and 
MBCS_STATE_ILLEGAL and reserved action codes 3152 */ 3153 return 0xffff; 3154 } 3155 } 3156 3157 if(i!=length) { 3158 /* illegal for this function: not all input consumed */ 3159 return 0xffff; 3160 } 3161 3162 if(c==0xfffe) { 3163 /* try an extension mapping */ 3164 const int32_t *cx=sharedData->mbcs.extIndexes; 3165 if(cx!=NULL) { 3166 return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 3167 } 3168 } 3169 3170 return c; 3171} 3172 3173/* MBCS-from-Unicode conversion functions ----------------------------------- */ 3174 3175/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ 3176static void 3177ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3178 UErrorCode *pErrorCode) { 3179 UConverter *cnv; 3180 const UChar *source, *sourceLimit; 3181 uint8_t *target; 3182 int32_t targetCapacity; 3183 int32_t *offsets; 3184 3185 const uint16_t *table; 3186 const uint16_t *mbcsIndex; 3187 const uint8_t *bytes; 3188 3189 UChar32 c; 3190 3191 int32_t sourceIndex, nextSourceIndex; 3192 3193 uint32_t stage2Entry; 3194 uint32_t asciiRoundtrips; 3195 uint32_t value; 3196 uint8_t unicodeMask; 3197 3198 /* use optimized function if possible */ 3199 cnv=pArgs->converter; 3200 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3201 3202 /* set up the local pointers */ 3203 source=pArgs->source; 3204 sourceLimit=pArgs->sourceLimit; 3205 target=(uint8_t *)pArgs->target; 3206 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3207 offsets=pArgs->offsets; 3208 3209 table=cnv->sharedData->mbcs.fromUnicodeTable; 3210 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3211 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3212 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3213 } else { 3214 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3215 } 3216 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3217 3218 /* get the converter state from UConverter */ 3219 c=cnv->fromUChar32; 3220 3221 /* sourceIndex=-1 if the 
current character began in the previous buffer */ 3222 sourceIndex= c==0 ? 0 : -1; 3223 nextSourceIndex=0; 3224 3225 /* conversion loop */ 3226 if(c!=0 && targetCapacity>0) { 3227 goto getTrail; 3228 } 3229 3230 while(source<sourceLimit) { 3231 /* 3232 * This following test is to see if available input would overflow the output. 3233 * It does not catch output of more than one byte that 3234 * overflows as a result of a multi-byte character or callback output 3235 * from the last source character. 3236 * Therefore, those situations also test for overflows and will 3237 * then break the loop, too. 3238 */ 3239 if(targetCapacity>0) { 3240 /* 3241 * Get a correct Unicode code point: 3242 * a single UChar for a BMP code point or 3243 * a matched surrogate pair for a "supplementary code point". 3244 */ 3245 c=*source++; 3246 ++nextSourceIndex; 3247 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3248 *target++=(uint8_t)c; 3249 if(offsets!=NULL) { 3250 *offsets++=sourceIndex; 3251 sourceIndex=nextSourceIndex; 3252 } 3253 --targetCapacity; 3254 c=0; 3255 continue; 3256 } 3257 /* 3258 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 3259 * to avoid dealing with surrogates. 3260 * MBCS_FAST_MAX must be >=0xd7ff. 3261 */ 3262 if(c<=0xd7ff) { 3263 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 3264 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 3265 if(value==0) { 3266 goto unassigned; 3267 } 3268 /* output the value */ 3269 } else { 3270 /* 3271 * This also tests if the codepage maps single surrogates. 3272 * If it does, then surrogates are not paired but mapped separately. 3273 * Note that in this case unmatched surrogates are not detected. 
3274 */ 3275 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3276 if(UTF_IS_SURROGATE_FIRST(c)) { 3277getTrail: 3278 if(source<sourceLimit) { 3279 /* test the following code unit */ 3280 UChar trail=*source; 3281 if(UTF_IS_SECOND_SURROGATE(trail)) { 3282 ++source; 3283 ++nextSourceIndex; 3284 c=UTF16_GET_PAIR_VALUE(c, trail); 3285 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3286 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3287 /* callback(unassigned) */ 3288 goto unassigned; 3289 } 3290 /* convert this supplementary code point */ 3291 /* exit this condition tree */ 3292 } else { 3293 /* this is an unmatched lead code unit (1st surrogate) */ 3294 /* callback(illegal) */ 3295 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3296 break; 3297 } 3298 } else { 3299 /* no more input */ 3300 break; 3301 } 3302 } else { 3303 /* this is an unmatched trail code unit (2nd surrogate) */ 3304 /* callback(illegal) */ 3305 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3306 break; 3307 } 3308 } 3309 3310 /* convert the Unicode code point in c into codepage bytes */ 3311 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3312 3313 /* get the bytes and the length for the output */ 3314 /* MBCS_OUTPUT_2 */ 3315 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3316 3317 /* is this code point assigned, or do we use fallbacks? */ 3318 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 3319 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 3320 ) { 3321 /* 3322 * We allow a 0 byte output if the "assigned" bit is set for this entry. 3323 * There is no way with this data structure for fallback output 3324 * to be a zero byte. 
3325 */ 3326 3327unassigned: 3328 /* try an extension mapping */ 3329 pArgs->source=source; 3330 c=_extFromU(cnv, cnv->sharedData, 3331 c, &source, sourceLimit, 3332 &target, target+targetCapacity, 3333 &offsets, sourceIndex, 3334 pArgs->flush, 3335 pErrorCode); 3336 nextSourceIndex+=(int32_t)(source-pArgs->source); 3337 3338 if(U_FAILURE(*pErrorCode)) { 3339 /* not mappable or buffer overflow */ 3340 break; 3341 } else { 3342 /* a mapping was written to the target, continue */ 3343 3344 /* recalculate the targetCapacity after an extension mapping */ 3345 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3346 3347 /* normal end of conversion: prepare for a new character */ 3348 sourceIndex=nextSourceIndex; 3349 continue; 3350 } 3351 } 3352 } 3353 3354 /* write the output character bytes from value and length */ 3355 /* from the first if in the loop we know that targetCapacity>0 */ 3356 if(value<=0xff) { 3357 /* this is easy because we know that there is enough space */ 3358 *target++=(uint8_t)value; 3359 if(offsets!=NULL) { 3360 *offsets++=sourceIndex; 3361 } 3362 --targetCapacity; 3363 } else /* length==2 */ { 3364 *target++=(uint8_t)(value>>8); 3365 if(2<=targetCapacity) { 3366 *target++=(uint8_t)value; 3367 if(offsets!=NULL) { 3368 *offsets++=sourceIndex; 3369 *offsets++=sourceIndex; 3370 } 3371 targetCapacity-=2; 3372 } else { 3373 if(offsets!=NULL) { 3374 *offsets++=sourceIndex; 3375 } 3376 cnv->charErrorBuffer[0]=(char)value; 3377 cnv->charErrorBufferLength=1; 3378 3379 /* target overflow */ 3380 targetCapacity=0; 3381 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3382 c=0; 3383 break; 3384 } 3385 } 3386 3387 /* normal end of conversion: prepare for a new character */ 3388 c=0; 3389 sourceIndex=nextSourceIndex; 3390 continue; 3391 } else { 3392 /* target is full */ 3393 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3394 break; 3395 } 3396 } 3397 3398 /* set the converter state back into UConverter */ 3399 cnv->fromUChar32=c; 3400 3401 /* write back the updated 
pointers */ 3402 pArgs->source=source; 3403 pArgs->target=(char *)target; 3404 pArgs->offsets=offsets; 3405} 3406 3407/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ 3408static void 3409ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3410 UErrorCode *pErrorCode) { 3411 UConverter *cnv; 3412 const UChar *source, *sourceLimit; 3413 uint8_t *target; 3414 int32_t targetCapacity; 3415 int32_t *offsets; 3416 3417 const uint16_t *table; 3418 const uint16_t *results; 3419 3420 UChar32 c; 3421 3422 int32_t sourceIndex, nextSourceIndex; 3423 3424 uint16_t value, minValue; 3425 UBool hasSupplementary; 3426 3427 /* set up the local pointers */ 3428 cnv=pArgs->converter; 3429 source=pArgs->source; 3430 sourceLimit=pArgs->sourceLimit; 3431 target=(uint8_t *)pArgs->target; 3432 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3433 offsets=pArgs->offsets; 3434 3435 table=cnv->sharedData->mbcs.fromUnicodeTable; 3436 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3437 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3438 } else { 3439 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3440 } 3441 3442 if(cnv->useFallback) { 3443 /* use all roundtrip and fallback results */ 3444 minValue=0x800; 3445 } else { 3446 /* use only roundtrips and fallbacks from private-use characters */ 3447 minValue=0xc00; 3448 } 3449 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 3450 3451 /* get the converter state from UConverter */ 3452 c=cnv->fromUChar32; 3453 3454 /* sourceIndex=-1 if the current character began in the previous buffer */ 3455 sourceIndex= c==0 ? 0 : -1; 3456 nextSourceIndex=0; 3457 3458 /* conversion loop */ 3459 if(c!=0 && targetCapacity>0) { 3460 goto getTrail; 3461 } 3462 3463 while(source<sourceLimit) { 3464 /* 3465 * This following test is to see if available input would overflow the output. 
3466 * It does not catch output of more than one byte that 3467 * overflows as a result of a multi-byte character or callback output 3468 * from the last source character. 3469 * Therefore, those situations also test for overflows and will 3470 * then break the loop, too. 3471 */ 3472 if(targetCapacity>0) { 3473 /* 3474 * Get a correct Unicode code point: 3475 * a single UChar for a BMP code point or 3476 * a matched surrogate pair for a "supplementary code point". 3477 */ 3478 c=*source++; 3479 ++nextSourceIndex; 3480 if(UTF_IS_SURROGATE(c)) { 3481 if(UTF_IS_SURROGATE_FIRST(c)) { 3482getTrail: 3483 if(source<sourceLimit) { 3484 /* test the following code unit */ 3485 UChar trail=*source; 3486 if(UTF_IS_SECOND_SURROGATE(trail)) { 3487 ++source; 3488 ++nextSourceIndex; 3489 c=UTF16_GET_PAIR_VALUE(c, trail); 3490 if(!hasSupplementary) { 3491 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3492 /* callback(unassigned) */ 3493 goto unassigned; 3494 } 3495 /* convert this supplementary code point */ 3496 /* exit this condition tree */ 3497 } else { 3498 /* this is an unmatched lead code unit (1st surrogate) */ 3499 /* callback(illegal) */ 3500 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3501 break; 3502 } 3503 } else { 3504 /* no more input */ 3505 break; 3506 } 3507 } else { 3508 /* this is an unmatched trail code unit (2nd surrogate) */ 3509 /* callback(illegal) */ 3510 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3511 break; 3512 } 3513 } 3514 3515 /* convert the Unicode code point in c into codepage bytes */ 3516 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3517 3518 /* is this code point assigned, or do we use fallbacks? 
*/ 3519 if(value>=minValue) { 3520 /* assigned, write the output character bytes from value and length */ 3521 /* length==1 */ 3522 /* this is easy because we know that there is enough space */ 3523 *target++=(uint8_t)value; 3524 if(offsets!=NULL) { 3525 *offsets++=sourceIndex; 3526 } 3527 --targetCapacity; 3528 3529 /* normal end of conversion: prepare for a new character */ 3530 c=0; 3531 sourceIndex=nextSourceIndex; 3532 } else { /* unassigned */ 3533unassigned: 3534 /* try an extension mapping */ 3535 pArgs->source=source; 3536 c=_extFromU(cnv, cnv->sharedData, 3537 c, &source, sourceLimit, 3538 &target, target+targetCapacity, 3539 &offsets, sourceIndex, 3540 pArgs->flush, 3541 pErrorCode); 3542 nextSourceIndex+=(int32_t)(source-pArgs->source); 3543 3544 if(U_FAILURE(*pErrorCode)) { 3545 /* not mappable or buffer overflow */ 3546 break; 3547 } else { 3548 /* a mapping was written to the target, continue */ 3549 3550 /* recalculate the targetCapacity after an extension mapping */ 3551 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3552 3553 /* normal end of conversion: prepare for a new character */ 3554 sourceIndex=nextSourceIndex; 3555 } 3556 } 3557 } else { 3558 /* target is full */ 3559 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3560 break; 3561 } 3562 } 3563 3564 /* set the converter state back into UConverter */ 3565 cnv->fromUChar32=c; 3566 3567 /* write back the updated pointers */ 3568 pArgs->source=source; 3569 pArgs->target=(char *)target; 3570 pArgs->offsets=offsets; 3571} 3572 3573/* 3574 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 3575 * that map only to and from the BMP. 3576 * In addition to single-byte/state optimizations, the offset calculations 3577 * become much easier. 3578 * It would be possible to use the sbcsIndex for UTF-8-friendly tables, 3579 * but measurements have shown that this diminishes performance 3580 * in more cases than it improves it. 
3581 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches 3582 * for various MBCS and SBCS optimizations. 3583 */ 3584static void 3585ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, 3586 UErrorCode *pErrorCode) { 3587 UConverter *cnv; 3588 const UChar *source, *sourceLimit, *lastSource; 3589 uint8_t *target; 3590 int32_t targetCapacity, length; 3591 int32_t *offsets; 3592 3593 const uint16_t *table; 3594 const uint16_t *results; 3595 3596 UChar32 c; 3597 3598 int32_t sourceIndex; 3599 3600 uint32_t asciiRoundtrips; 3601 uint16_t value, minValue; 3602 3603 /* set up the local pointers */ 3604 cnv=pArgs->converter; 3605 source=pArgs->source; 3606 sourceLimit=pArgs->sourceLimit; 3607 target=(uint8_t *)pArgs->target; 3608 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3609 offsets=pArgs->offsets; 3610 3611 table=cnv->sharedData->mbcs.fromUnicodeTable; 3612 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3613 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3614 } else { 3615 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3616 } 3617 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3618 3619 if(cnv->useFallback) { 3620 /* use all roundtrip and fallback results */ 3621 minValue=0x800; 3622 } else { 3623 /* use only roundtrips and fallbacks from private-use characters */ 3624 minValue=0xc00; 3625 } 3626 3627 /* get the converter state from UConverter */ 3628 c=cnv->fromUChar32; 3629 3630 /* sourceIndex=-1 if the current character began in the previous buffer */ 3631 sourceIndex= c==0 ? 
0 : -1; 3632 lastSource=source; 3633 3634 /* 3635 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 3636 * for the minimum of the sourceLength and targetCapacity 3637 */ 3638 length=(int32_t)(sourceLimit-source); 3639 if(length<targetCapacity) { 3640 targetCapacity=length; 3641 } 3642 3643 /* conversion loop */ 3644 if(c!=0 && targetCapacity>0) { 3645 goto getTrail; 3646 } 3647 3648#if MBCS_UNROLL_SINGLE_FROM_BMP 3649 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3650 /* unroll the loop with the most common case */ 3651unrolled: 3652 if(targetCapacity>=4) { 3653 int32_t count, loops; 3654 uint16_t andedValues; 3655 3656 loops=count=targetCapacity>>2; 3657 do { 3658 c=*source++; 3659 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3660 *target++=(uint8_t)value; 3661 c=*source++; 3662 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3663 *target++=(uint8_t)value; 3664 c=*source++; 3665 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3666 *target++=(uint8_t)value; 3667 c=*source++; 3668 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3669 *target++=(uint8_t)value; 3670 3671 /* were all 4 entries really valid? */ 3672 if(andedValues<minValue) { 3673 /* no, return to the first of these 4 */ 3674 source-=4; 3675 target-=4; 3676 break; 3677 } 3678 } while(--count>0); 3679 count=loops-count; 3680 targetCapacity-=4*count; 3681 3682 if(offsets!=NULL) { 3683 lastSource+=4*count; 3684 while(count>0) { 3685 *offsets++=sourceIndex++; 3686 *offsets++=sourceIndex++; 3687 *offsets++=sourceIndex++; 3688 *offsets++=sourceIndex++; 3689 --count; 3690 } 3691 } 3692 3693 c=0; 3694 } 3695#endif 3696 3697 while(targetCapacity>0) { 3698 /* 3699 * Get a correct Unicode code point: 3700 * a single UChar for a BMP code point or 3701 * a matched surrogate pair for a "supplementary code point". 
3702 */ 3703 c=*source++; 3704 /* 3705 * Do not immediately check for single surrogates: 3706 * Assume that they are unassigned and check for them in that case. 3707 * This speeds up the conversion of assigned characters. 3708 */ 3709 /* convert the Unicode code point in c into codepage bytes */ 3710 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3711 *target++=(uint8_t)c; 3712 --targetCapacity; 3713 c=0; 3714 continue; 3715 } 3716 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3717 /* is this code point assigned, or do we use fallbacks? */ 3718 if(value>=minValue) { 3719 /* assigned, write the output character bytes from value and length */ 3720 /* length==1 */ 3721 /* this is easy because we know that there is enough space */ 3722 *target++=(uint8_t)value; 3723 --targetCapacity; 3724 3725 /* normal end of conversion: prepare for a new character */ 3726 c=0; 3727 continue; 3728 } else if(!UTF_IS_SURROGATE(c)) { 3729 /* normal, unassigned BMP character */ 3730 } else if(UTF_IS_SURROGATE_FIRST(c)) { 3731getTrail: 3732 if(source<sourceLimit) { 3733 /* test the following code unit */ 3734 UChar trail=*source; 3735 if(UTF_IS_SECOND_SURROGATE(trail)) { 3736 ++source; 3737 c=UTF16_GET_PAIR_VALUE(c, trail); 3738 /* this codepage does not map supplementary code points */ 3739 /* callback(unassigned) */ 3740 } else { 3741 /* this is an unmatched lead code unit (1st surrogate) */ 3742 /* callback(illegal) */ 3743 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3744 break; 3745 } 3746 } else { 3747 /* no more input */ 3748 if (pArgs->flush) { 3749 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3750 } 3751 break; 3752 } 3753 } else { 3754 /* this is an unmatched trail code unit (2nd surrogate) */ 3755 /* callback(illegal) */ 3756 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3757 break; 3758 } 3759 3760 /* c does not have a mapping */ 3761 3762 /* get the number of code units for c to correctly advance sourceIndex */ 3763 length=U16_LENGTH(c); 3764 3765 /* set offsets since the start or the last 
extension */ 3766 if(offsets!=NULL) { 3767 int32_t count=(int32_t)(source-lastSource); 3768 3769 /* do not set the offset for this character */ 3770 count-=length; 3771 3772 while(count>0) { 3773 *offsets++=sourceIndex++; 3774 --count; 3775 } 3776 /* offsets and sourceIndex are now set for the current character */ 3777 } 3778 3779 /* try an extension mapping */ 3780 lastSource=source; 3781 c=_extFromU(cnv, cnv->sharedData, 3782 c, &source, sourceLimit, 3783 &target, (const uint8_t *)(pArgs->targetLimit), 3784 &offsets, sourceIndex, 3785 pArgs->flush, 3786 pErrorCode); 3787 sourceIndex+=length+(int32_t)(source-lastSource); 3788 lastSource=source; 3789 3790 if(U_FAILURE(*pErrorCode)) { 3791 /* not mappable or buffer overflow */ 3792 break; 3793 } else { 3794 /* a mapping was written to the target, continue */ 3795 3796 /* recalculate the targetCapacity after an extension mapping */ 3797 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3798 length=(int32_t)(sourceLimit-source); 3799 if(length<targetCapacity) { 3800 targetCapacity=length; 3801 } 3802 } 3803 3804#if MBCS_UNROLL_SINGLE_FROM_BMP 3805 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3806 goto unrolled; 3807#endif 3808 } 3809 3810 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { 3811 /* target is full */ 3812 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3813 } 3814 3815 /* set offsets since the start or the last callback */ 3816 if(offsets!=NULL) { 3817 size_t count=source-lastSource; 3818 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) { 3819 /* 3820 Caller gave us a partial supplementary character, 3821 which this function couldn't convert in any case. 3822 The callback will handle the offset. 
*/
            count--;
        }
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    /* set the converter state back into UConverter */
    cnv->fromUChar32=c;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}

/*
 * General fromUnicode conversion loop for MBCS converters.
 *
 * Reads UTF-16 code units from pArgs->source up to pArgs->sourceLimit and
 * writes codepage bytes to pArgs->target up to pArgs->targetLimit.
 * If pArgs->offsets is not NULL, one source index is written per output byte.
 *
 * Before the loop:
 * - a pending extension match (cnv->preFromUFirstCP>=0) is continued via
 *   ucnv_extContinueMatchFromU(); the function returns if that fails or
 *   leaves cnv->preFromULength<0
 * - MBCS_OUTPUT_1 converters that do not map single surrogates, and
 *   MBCS_OUTPUT_2 converters with utf8Friendly data, are delegated to the
 *   specialized ucnv_MBCSSingle.../ucnv_MBCSDouble... functions
 *
 * Converter state used across calls:
 * - cnv->fromUChar32: a lead surrogate whose trail has not been seen yet
 * - cnv->fromUnicodeStatus: for MBCS_OUTPUT_2_SISO, the length (1 or 2) of
 *   the previous character, i.e. the current SBCS/DBCS shift state
 *
 * Errors set in *pErrorCode by this function:
 * - U_ILLEGAL_CHAR_FOUND for an unmatched surrogate
 * - U_BUFFER_OVERFLOW_ERROR when the target is full; bytes of the current
 *   character that do not fit are stored in cnv->charErrorBuffer
 * Unassigned code points are passed to _extFromU(), which may set an error
 * code as well.
 */
U_CFUNC void
ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    const uint16_t *table;
    const uint16_t *mbcsIndex;
    const uint8_t *p, *bytes;
    uint8_t outputType;

    UChar32 c;

    int32_t prevSourceIndex, sourceIndex, nextSourceIndex;

    uint32_t stage2Entry;
    uint32_t asciiRoundtrips;
    uint32_t value;
    int32_t length, prevLength;
    uint8_t unicodeMask;

    cnv=pArgs->converter;

    if(cnv->preFromUFirstCP>=0) {
        /*
         * pass sourceIndex=-1 because we continue from an earlier buffer
         * in the future, this may change with continuous offsets
         */
        ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);

        if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
            return;
        }
    }

    /* use optimized function if possible */
    outputType=cnv->sharedData->mbcs.outputType;
    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
    if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
        if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
            ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
        } else {
            ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
        }
        return;
    } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
        ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
        return;
    }

    /* set up the local pointers */
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    /* mbcsIndex!=NULL enables the utf8Friendly fast path in the loop below */
    if(cnv->sharedData->mbcs.utf8Friendly) {
        mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
    } else {
        mbcsIndex=NULL;
    }
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;

    if(outputType==MBCS_OUTPUT_2_SISO) {
        prevLength=cnv->fromUnicodeStatus;
        if(prevLength==0) {
            /* set the real value */
            prevLength=1;
        }
    } else {
        /* prevent fromUnicodeStatus from being set to something non-0 */
        prevLength=0;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    prevSourceIndex=-1;
    sourceIndex= c==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    /*
     * This is another piece of ugly code:
     * A goto into the loop if the converter state contains a first surrogate
     * from the previous function call.
     * It saves a check of if(c==0) in each loop iteration and avoids
     * duplicating the trail-surrogate-handling code in the else
     * branch of that check.
     * I could not find any other way to get around this other than
     * using a function call for the conversion and callback, which would
     * be even more inefficient.
     *
     * Markus Scherer 2000-jul-19
     */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

    while(source<sourceLimit) {
        /*
         * This following test is to see if available input would overflow the output.
         * It does not catch output of more than one byte that
         * overflows as a result of a multi-byte character or callback output
         * from the last source character.
         * Therefore, those situations also test for overflows and will
         * then break the loop, too.
         */
        if(targetCapacity>0) {
            /*
             * Get a correct Unicode code point:
             * a single UChar for a BMP code point or
             * a matched surrogate pair for a "supplementary code point".
             */
            c=*source++;
            ++nextSourceIndex;
            /* fast path: ASCII characters that map to the same byte value */
            if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
                *target++=(uint8_t)c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                    prevSourceIndex=sourceIndex;
                    sourceIndex=nextSourceIndex;
                }
                --targetCapacity;
                c=0;
                continue;
            }
            /*
             * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
             * to avoid dealing with surrogates.
             * MBCS_FAST_MAX must be >=0xd7ff.
             */
            if(c<=0xd7ff && mbcsIndex!=NULL) {
                value=mbcsIndex[c>>6];

                /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
                /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
                switch(outputType) {
                case MBCS_OUTPUT_2:
                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_2_SISO:
                    /* 1/2-byte stateful with Shift-In/Shift-Out */
                    /*
                     * Save the old state in the converter object
                     * right here, then change the local prevLength state variable if necessary.
                     * Then, if this character turns out to be unassigned or a fallback that
                     * is not taken, the callback code must not save the new state in the converter
                     * because the new state is for a character that is not output.
                     * However, the callback must still restore the state from the converter
                     * in case the callback function changed it for its output.
                     */
                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else if(prevLength<=1) {
                            length=1;
                        } else {
                            /* change from double-byte mode to single-byte */
                            value|=(uint32_t)UCNV_SI<<8;
                            length=2;
                            prevLength=1;
                        }
                    } else {
                        if(prevLength==2) {
                            length=2;
                        } else {
                            /* change from single-byte mode to double-byte */
                            value|=(uint32_t)UCNV_SO<<16;
                            length=3;
                            prevLength=2;
                        }
                    }
                    break;
                case MBCS_OUTPUT_DBCS_ONLY:
                    /* table with single-byte results, but only DBCS mappings used */
                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
                    if(value<=0xff) {
                        /* no mapping or SBCS result, not taken for DBCS-only */
                        goto unassigned;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_3:
                    p=bytes+(value+(c&0x3f))*3;
                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else if(value<=0xffff) {
                        length=2;
                    } else {
                        length=3;
                    }
                    break;
                case MBCS_OUTPUT_4:
                    value=((const uint32_t *)bytes)[value +(c&0x3f)];
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else if(value<=0xffff) {
                        length=2;
                    } else if(value<=0xffffff) {
                        length=3;
                    } else {
                        length=4;
                    }
                    break;
                case MBCS_OUTPUT_3_EUC:
                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
                    /* EUC 16-bit fixed-length representation */
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else if((value&0x8000)==0) {
                        value|=0x8e8000;
                        length=3;
                    } else if((value&0x80)==0) {
                        value|=0x8f0080;
                        length=3;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_4_EUC:
                    p=bytes+(value+(c&0x3f))*3;
                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else if(value<=0xffff) {
                        length=2;
                    } else if((value&0x800000)==0) {
                        value|=0x8e800000;
                        length=4;
                    } else if((value&0x8000)==0) {
                        value|=0x8f008000;
                        length=4;
                    } else {
                        length=3;
                    }
                    break;
                default:
                    /* must not occur */
                    /*
                     * To avoid compiler warnings that value & length may be
                     * used without having been initialized, we set them here.
                     * In reality, this is unreachable code.
                     * Not having a default branch also causes warnings with
                     * some compilers.
                     */
                    value=0;
                    length=0;
                    break;
                }
                /* output the value */
            } else {
                /*
                 * This also tests if the codepage maps single surrogates.
                 * If it does, then surrogates are not paired but mapped separately.
                 * Note that in this case unmatched surrogates are not detected.
                 */
                if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
                    if(UTF_IS_SURROGATE_FIRST(c)) {
getTrail:
                        if(source<sourceLimit) {
                            /* test the following code unit */
                            UChar trail=*source;
                            if(UTF_IS_SECOND_SURROGATE(trail)) {
                                ++source;
                                ++nextSourceIndex;
                                c=UTF16_GET_PAIR_VALUE(c, trail);
                                if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
                                    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
                                    /* callback(unassigned) */
                                    goto unassigned;
                                }
                                /* convert this supplementary code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                                break;
                            }
                        } else {
                            /* no more input; the lead surrogate stays in c and is saved in cnv->fromUChar32 below */
                            break;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    }
                }

                /* convert the Unicode code point in c into codepage bytes */

                /*
                 * The basic lookup is a triple-stage compact array (trie) lookup.
                 * For details see the beginning of this file.
                 *
                 * Single-byte codepages are handled with a different data structure
                 * by _MBCSSingle... functions.
                 *
                 * The result consists of a 32-bit value from stage 2 and
                 * a pointer to as many bytes as are stored per character.
                 * The pointer points to the character's bytes in stage 3.
                 * Bits 15..0 of the stage 2 entry contain the stage 3 index
                 * for that pointer, while bits 31..16 are flags for which of
                 * the 16 characters in the block are roundtrip-assigned.
                 *
                 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
                 * respectively as uint32_t, in the platform encoding.
                 * For 3-byte codepages, the bytes are always stored in big-endian order.
                 *
                 * For EUC encodings that use only either 0x8e or 0x8f as the first
                 * byte of their longest byte sequences, the first two bytes in
                 * this third stage indicate with their 7th bits whether these bytes
                 * are to be written directly or actually need to be preceded by
                 * one of the two Single-Shift codes. With this, the third stage
                 * stores one byte fewer per character than the actual maximum length of
                 * EUC byte sequences.
                 *
                 * Other than that, leading zero bytes are removed and the other
                 * bytes output. A single zero byte may be output if the "assigned"
                 * bit in stage 2 was on.
                 * The data structure does not support zero byte output as a fallback,
                 * and also does not allow output of leading zeros.
                 */
                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

                /* get the bytes and the length for the output */
                switch(outputType) {
                case MBCS_OUTPUT_2:
                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
                    if(value<=0xff) {
                        length=1;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_2_SISO:
                    /* 1/2-byte stateful with Shift-In/Shift-Out */
                    /*
                     * Save the old state in the converter object
                     * right here, then change the local prevLength state variable if necessary.
                     * Then, if this character turns out to be unassigned or a fallback that
                     * is not taken, the callback code must not save the new state in the converter
                     * because the new state is for a character that is not output.
                     * However, the callback must still restore the state from the converter
                     * in case the callback function changed it for its output.
                     */
                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
                    if(value<=0xff) {
                        if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
                            /* no mapping, leave value==0 */
                            length=0;
                        } else if(prevLength<=1) {
                            length=1;
                        } else {
                            /* change from double-byte mode to single-byte */
                            value|=(uint32_t)UCNV_SI<<8;
                            length=2;
                            prevLength=1;
                        }
                    } else {
                        if(prevLength==2) {
                            length=2;
                        } else {
                            /* change from single-byte mode to double-byte */
                            value|=(uint32_t)UCNV_SO<<16;
                            length=3;
                            prevLength=2;
                        }
                    }
                    break;
                case MBCS_OUTPUT_DBCS_ONLY:
                    /* table with single-byte results, but only DBCS mappings used */
                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
                    if(value<=0xff) {
                        /* no mapping or SBCS result, not taken for DBCS-only */
                        value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
                        length=0;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_3:
                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
                    if(value<=0xff) {
                        length=1;
                    } else if(value<=0xffff) {
                        length=2;
                    } else {
                        length=3;
                    }
                    break;
                case MBCS_OUTPUT_4:
                    value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
                    if(value<=0xff) {
                        length=1;
                    } else if(value<=0xffff) {
                        length=2;
                    } else if(value<=0xffffff) {
                        length=3;
                    } else {
                        length=4;
                    }
                    break;
                case MBCS_OUTPUT_3_EUC:
                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
                    /* EUC 16-bit fixed-length representation */
                    if(value<=0xff) {
                        length=1;
                    } else if((value&0x8000)==0) {
                        value|=0x8e8000;
                        length=3;
                    } else if((value&0x80)==0) {
                        value|=0x8f0080;
                        length=3;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_4_EUC:
                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
                    if(value<=0xff) {
                        length=1;
                    } else if(value<=0xffff) {
                        length=2;
                    } else if((value&0x800000)==0) {
                        value|=0x8e800000;
                        length=4;
                    } else if((value&0x8000)==0) {
                        value|=0x8f008000;
                        length=4;
                    } else {
                        length=3;
                    }
                    break;
                default:
                    /* must not occur */
                    /*
                     * To avoid compiler warnings that value & length may be
                     * used without having been initialized, we set them here.
                     * In reality, this is unreachable code.
                     * Not having a default branch also causes warnings with
                     * some compilers.
                     */
                    value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
                    length=0;
                    break;
                }

                /* is this code point assigned, or do we use fallbacks? */
                if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
                     (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
                ) {
                    /*
                     * We allow a 0 byte output if the "assigned" bit is set for this entry.
                     * There is no way with this data structure for fallback output
                     * to be a zero byte.
                     */

unassigned:
                    /* try an extension mapping */
                    /* pArgs->source is set so that the input consumed by _extFromU() can be measured below */
                    pArgs->source=source;
                    c=_extFromU(cnv, cnv->sharedData,
                                c, &source, sourceLimit,
                                &target, target+targetCapacity,
                                &offsets, sourceIndex,
                                pArgs->flush,
                                pErrorCode);
                    nextSourceIndex+=(int32_t)(source-pArgs->source);
                    prevLength=cnv->fromUnicodeStatus; /* restore SISO state */

                    if(U_FAILURE(*pErrorCode)) {
                        /* not mappable or buffer overflow */
                        break;
                    } else {
                        /* a mapping was written to the target, continue */

                        /* recalculate the targetCapacity after an extension mapping */
                        targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);

                        /* normal end of conversion: prepare for a new character */
                        if(offsets!=NULL) {
                            prevSourceIndex=sourceIndex;
                            sourceIndex=nextSourceIndex;
                        }
                        continue;
                    }
                }
            }

            /* write the output character bytes from value and length */
            /* from the first if in the loop we know that targetCapacity>0 */
            if(length<=targetCapacity) {
                if(offsets==NULL) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(value>>24);
                    case 3:
                        *target++=(uint8_t)(value>>16);
                    case 2:
                        *target++=(uint8_t)(value>>8);
                    case 1:
                        *target++=(uint8_t)value;
                    default:
                        /* will never occur */
                        break;
                    }
                } else {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(value>>24);
                        *offsets++=sourceIndex;
                    case 3:
                        *target++=(uint8_t)(value>>16);
                        *offsets++=sourceIndex;
                    case 2:
                        *target++=(uint8_t)(value>>8);
                        *offsets++=sourceIndex;
                    case 1:
                        *target++=(uint8_t)value;
                        *offsets++=sourceIndex;
                    default:
                        /* will never occur */
                        break;
                    }
                }
                targetCapacity-=length;
            } else {
                uint8_t *charErrorBuffer;

                /*
                 * We actually do this backwards here:
                 * In order to save an intermediate variable, we output
                 * first to the overflow buffer what does not fit into the
                 * regular target.
                 */
                /* we know that 1<=targetCapacity<length<=4 */
                length-=targetCapacity;
                charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                switch(length) {
                    /* each branch falls through to the next one */
                case 3:
                    *charErrorBuffer++=(uint8_t)(value>>16);
                case 2:
                    *charErrorBuffer++=(uint8_t)(value>>8);
                case 1:
                    *charErrorBuffer=(uint8_t)value;
                default:
                    /* will never occur */
                    break;
                }
                cnv->charErrorBufferLength=(int8_t)length;

                /* now output what fits into the regular target */
                value>>=8*length; /* length was reduced by targetCapacity */
                switch(targetCapacity) {
                    /* each branch falls through to the next one */
                case 3:
                    *target++=(uint8_t)(value>>16);
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                    }
                case 2:
                    *target++=(uint8_t)(value>>8);
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                    }
                case 1:
                    *target++=(uint8_t)value;
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                    }
                default:
                    /* will never occur */
                    break;
                }

                /* target overflow */
                targetCapacity=0;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                c=0;
                break;
            }

            /* normal end of conversion: prepare for a new character */
            c=0;
            if(offsets!=NULL) {
                prevSourceIndex=sourceIndex;
                sourceIndex=nextSourceIndex;
            }
            continue;
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for EBCDIC_STATEFUL conversion
     * we need to emit an SI at the very end
     *
     * conditions:
     *   successful
     *   EBCDIC_STATEFUL in DBCS mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*pErrorCode) &&
        outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
        pArgs->flush && source>=sourceLimit && c==0
    ) {
        /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
        if(targetCapacity>0) {
            *target++=(uint8_t)UCNV_SI;
            if(offsets!=NULL) {
                /* set the last source character's index (sourceIndex points at sourceLimit now) */
                *offsets++=prevSourceIndex;
            }
        } else {
            /* target is full */
            cnv->charErrorBuffer[0]=(char)UCNV_SI;
            cnv->charErrorBufferLength=1;
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
        prevLength=1; /* we switched into SBCS */
    }

    /* set the converter state back into UConverter */
    cnv->fromUChar32=c;
    cnv->fromUnicodeStatus=prevLength;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}

/*
 * This is another simple conversion function for internal use by other
 * conversion implementations.
 * It does not use the converter state nor call callbacks.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It handles conversion extensions but not GB 18030.
 *
 * It converts one single Unicode code point into codepage bytes, encoded
 * as one 32-bit value. The function returns the number of bytes in *pValue:
 * 1..4 the number of bytes in *pValue
 * 0    unassigned (*pValue undefined)
 * -1   illegal (currently not used, *pValue undefined)
 *
 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
 * the second to last byte in bits 15..8, etc.
 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
4545 */ 4546U_CFUNC int32_t 4547ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 4548 UChar32 c, uint32_t *pValue, 4549 UBool useFallback) { 4550 const int32_t *cx; 4551 const uint16_t *table; 4552#if 0 4553/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4554 const uint8_t *p; 4555#endif 4556 uint32_t stage2Entry; 4557 uint32_t value; 4558 int32_t length; 4559 4560 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4561 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4562 table=sharedData->mbcs.fromUnicodeTable; 4563 4564 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4565 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 4566 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4567 /* is this code point assigned, or do we use fallbacks? */ 4568 if(useFallback ? value>=0x800 : value>=0xc00) { 4569 *pValue=value&0xff; 4570 return 1; 4571 } 4572 } else /* outputType!=MBCS_OUTPUT_1 */ { 4573 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4574 4575 /* get the bytes and the length for the output */ 4576 switch(sharedData->mbcs.outputType) { 4577 case MBCS_OUTPUT_2: 4578 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4579 if(value<=0xff) { 4580 length=1; 4581 } else { 4582 length=2; 4583 } 4584 break; 4585#if 0 4586/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4587 case MBCS_OUTPUT_DBCS_ONLY: 4588 /* table with single-byte results, but only DBCS mappings used */ 4589 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4590 if(value<=0xff) { 4591 /* no mapping or SBCS result, not taken for DBCS-only */ 4592 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4593 length=0; 4594 } else { 4595 length=2; 4596 } 4597 break; 4598 case MBCS_OUTPUT_3: 
4599 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4600 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4601 if(value<=0xff) { 4602 length=1; 4603 } else if(value<=0xffff) { 4604 length=2; 4605 } else { 4606 length=3; 4607 } 4608 break; 4609 case MBCS_OUTPUT_4: 4610 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4611 if(value<=0xff) { 4612 length=1; 4613 } else if(value<=0xffff) { 4614 length=2; 4615 } else if(value<=0xffffff) { 4616 length=3; 4617 } else { 4618 length=4; 4619 } 4620 break; 4621 case MBCS_OUTPUT_3_EUC: 4622 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4623 /* EUC 16-bit fixed-length representation */ 4624 if(value<=0xff) { 4625 length=1; 4626 } else if((value&0x8000)==0) { 4627 value|=0x8e8000; 4628 length=3; 4629 } else if((value&0x80)==0) { 4630 value|=0x8f0080; 4631 length=3; 4632 } else { 4633 length=2; 4634 } 4635 break; 4636 case MBCS_OUTPUT_4_EUC: 4637 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4638 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4639 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4640 if(value<=0xff) { 4641 length=1; 4642 } else if(value<=0xffff) { 4643 length=2; 4644 } else if((value&0x800000)==0) { 4645 value|=0x8e800000; 4646 length=4; 4647 } else if((value&0x8000)==0) { 4648 value|=0x8f008000; 4649 length=4; 4650 } else { 4651 length=3; 4652 } 4653 break; 4654#endif 4655 default: 4656 /* must not occur */ 4657 return -1; 4658 } 4659 4660 /* is this code point assigned, or do we use fallbacks? */ 4661 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4662 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 4663 ) { 4664 /* 4665 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4666 * There is no way with this data structure for fallback output 4667 * to be a zero byte. 
4668 */ 4669 /* assigned */ 4670 *pValue=value; 4671 return length; 4672 } 4673 } 4674 } 4675 4676 cx=sharedData->mbcs.extIndexes; 4677 if(cx!=NULL) { 4678 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 4679 return length>=0 ? length : -length; /* return abs(length); */ 4680 } 4681 4682 /* unassigned */ 4683 return 0; 4684} 4685 4686 4687#if 0 4688/* 4689 * This function has been moved to ucnv2022.c for inlining. 4690 * This implementation is here only for documentation purposes 4691 */ 4692 4693/** 4694 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. 4695 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4696 * It does not handle conversion extensions (_extFromU()). 4697 * 4698 * It returns the codepage byte for the code point, or -1 if it is unassigned. 4699 */ 4700U_CFUNC int32_t 4701ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, 4702 UChar32 c, 4703 UBool useFallback) { 4704 const uint16_t *table; 4705 int32_t value; 4706 4707 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4708 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4709 return -1; 4710 } 4711 4712 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4713 table=sharedData->mbcs.fromUnicodeTable; 4714 4715 /* get the byte for the output */ 4716 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4717 /* is this code point assigned, or do we use fallbacks? */ 4718 if(useFallback ? 
value>=0x800 : value>=0xc00) { 4719 return value&0xff; 4720 } else { 4721 return -1; 4722 } 4723} 4724#endif 4725 4726/* MBCS-from-UTF-8 conversion functions ------------------------------------- */ 4727 4728/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ 4729static const UChar32 4730utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; 4731 4732/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ 4733static const UChar32 4734utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; 4735 4736static void 4737ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 4738 UConverterToUnicodeArgs *pToUArgs, 4739 UErrorCode *pErrorCode) { 4740 UConverter *utf8, *cnv; 4741 const uint8_t *source, *sourceLimit; 4742 uint8_t *target; 4743 int32_t targetCapacity; 4744 4745 const uint16_t *table, *sbcsIndex; 4746 const uint16_t *results; 4747 4748 int8_t oldToULength, toULength, toULimit; 4749 4750 UChar32 c; 4751 uint8_t b, t1, t2; 4752 4753 uint32_t asciiRoundtrips; 4754 uint16_t value, minValue; 4755 UBool hasSupplementary; 4756 4757 /* set up the local pointers */ 4758 utf8=pToUArgs->converter; 4759 cnv=pFromUArgs->converter; 4760 source=(uint8_t *)pToUArgs->source; 4761 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 4762 target=(uint8_t *)pFromUArgs->target; 4763 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 4764 4765 table=cnv->sharedData->mbcs.fromUnicodeTable; 4766 sbcsIndex=cnv->sharedData->mbcs.sbcsIndex; 4767 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 4768 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 4769 } else { 4770 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 4771 } 4772 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4773 4774 if(cnv->useFallback) { 4775 /* use all roundtrip and fallback results */ 4776 minValue=0x800; 4777 } else { 4778 /* use only roundtrips and fallbacks from private-use characters */ 4779 minValue=0xc00; 4780 } 4781 
hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 4782 4783 /* get the converter state from the UTF-8 UConverter */ 4784 c=(UChar32)utf8->toUnicodeStatus; 4785 if(c!=0) { 4786 toULength=oldToULength=utf8->toULength; 4787 toULimit=(int8_t)utf8->mode; 4788 } else { 4789 toULength=oldToULength=toULimit=0; 4790 } 4791 4792 /* 4793 * Make sure that the last byte sequence before sourceLimit is complete 4794 * or runs into a lead byte. 4795 * Do not go back into the bytes that will be read for finishing a partial 4796 * sequence from the previous buffer. 4797 * In the conversion loop compare source with sourceLimit only once 4798 * per multi-byte character. 4799 */ 4800 { 4801 int32_t i, length; 4802 4803 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 4804 for(i=0; i<3 && i<length;) { 4805 b=*(sourceLimit-i-1); 4806 if(U8_IS_TRAIL(b)) { 4807 ++i; 4808 } else { 4809 if(i<utf8_countTrailBytes[b]) { 4810 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 4811 sourceLimit-=i+1; 4812 } 4813 break; 4814 } 4815 } 4816 } 4817 4818 if(c!=0 && targetCapacity>0) { 4819 utf8->toUnicodeStatus=0; 4820 utf8->toULength=0; 4821 goto moreBytes; 4822 /* 4823 * Note: We could avoid the goto by duplicating some of the moreBytes 4824 * code, but only up to the point of collecting a complete UTF-8 4825 * sequence; then recurse for the toUBytes[toULength] 4826 * and then continue with normal conversion. 4827 * 4828 * If so, move this code to just after initializing the minimum 4829 * set of local variables for reading the UTF-8 input 4830 * (utf8, source, target, limits but not cnv, table, minValue, etc.). 
4831 * 4832 * Potential advantages: 4833 * - avoid the goto 4834 * - oldToULength could become a local variable in just those code blocks 4835 * that deal with buffer boundaries 4836 * - possibly faster if the goto prevents some compiler optimizations 4837 * (this would need measuring to confirm) 4838 * Disadvantage: 4839 * - code duplication 4840 */ 4841 } 4842 4843 /* conversion loop */ 4844 while(source<sourceLimit) { 4845 if(targetCapacity>0) { 4846 b=*source++; 4847 if((int8_t)b>=0) { 4848 /* convert ASCII */ 4849 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 4850 *target++=(uint8_t)b; 4851 --targetCapacity; 4852 continue; 4853 } else { 4854 c=b; 4855 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c); 4856 } 4857 } else { 4858 if(b<0xe0) { 4859 if( /* handle U+0080..U+07FF inline */ 4860 b>=0xc2 && 4861 (t1=(uint8_t)(*source-0x80)) <= 0x3f 4862 ) { 4863 c=b&0x1f; 4864 ++source; 4865 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1); 4866 if(value>=minValue) { 4867 *target++=(uint8_t)value; 4868 --targetCapacity; 4869 continue; 4870 } else { 4871 c=(c<<6)|t1; 4872 } 4873 } else { 4874 c=-1; 4875 } 4876 } else if(b==0xe0) { 4877 if( /* handle U+0800..U+0FFF inline */ 4878 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 && 4879 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 4880 ) { 4881 c=t1; 4882 source+=2; 4883 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2); 4884 if(value>=minValue) { 4885 *target++=(uint8_t)value; 4886 --targetCapacity; 4887 continue; 4888 } else { 4889 c=(c<<6)|t2; 4890 } 4891 } else { 4892 c=-1; 4893 } 4894 } else { 4895 c=-1; 4896 } 4897 4898 if(c<0) { 4899 /* handle "complicated" and error cases, and continuing partial characters */ 4900 oldToULength=0; 4901 toULength=1; 4902 toULimit=utf8_countTrailBytes[b]+1; 4903 c=b; 4904moreBytes: 4905 while(toULength<toULimit) { 4906 if(source<sourceLimit) { 4907 b=*source; 4908 if(U8_IS_TRAIL(b)) { 4909 ++source; 4910 ++toULength; 4911 c=(c<<6)+b; 4912 } else { 4913 break; /* sequence 
too short, stop with toULength<toULimit */ 4914 } 4915 } else { 4916 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 4917 source-=(toULength-oldToULength); 4918 while(oldToULength<toULength) { 4919 utf8->toUBytes[oldToULength++]=*source++; 4920 } 4921 utf8->toUnicodeStatus=c; 4922 utf8->toULength=toULength; 4923 utf8->mode=toULimit; 4924 pToUArgs->source=(char *)source; 4925 pFromUArgs->target=(char *)target; 4926 return; 4927 } 4928 } 4929 4930 if( toULength==toULimit && /* consumed all trail bytes */ 4931 (toULength==3 || toULength==2) && /* BMP */ 4932 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 4933 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 4934 ) { 4935 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 4936 } else if( 4937 toULength==toULimit && toULength==4 && 4938 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 4939 ) { 4940 /* supplementary code point */ 4941 if(!hasSupplementary) { 4942 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4943 value=0; 4944 } else { 4945 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 4946 } 4947 } else { 4948 /* error handling: illegal UTF-8 byte sequence */ 4949 source-=(toULength-oldToULength); 4950 while(oldToULength<toULength) { 4951 utf8->toUBytes[oldToULength++]=*source++; 4952 } 4953 utf8->toULength=toULength; 4954 pToUArgs->source=(char *)source; 4955 pFromUArgs->target=(char *)target; 4956 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4957 return; 4958 } 4959 } 4960 } 4961 4962 if(value>=minValue) { 4963 /* output the mapping for c */ 4964 *target++=(uint8_t)value; 4965 --targetCapacity; 4966 } else { 4967 /* value<minValue means c is unassigned (unmappable) */ 4968 /* 4969 * Try an extension mapping. 4970 * Pass in no source because we don't have UTF-16 input. 4971 * If we have a partial match on c, we will return and revert 4972 * to UTF-8->UTF-16->charset conversion. 
4973 */ 4974 static const UChar nul=0; 4975 const UChar *noSource=&nul; 4976 c=_extFromU(cnv, cnv->sharedData, 4977 c, &noSource, noSource, 4978 &target, target+targetCapacity, 4979 NULL, -1, 4980 pFromUArgs->flush, 4981 pErrorCode); 4982 4983 if(U_FAILURE(*pErrorCode)) { 4984 /* not mappable or buffer overflow */ 4985 cnv->fromUChar32=c; 4986 break; 4987 } else if(cnv->preFromUFirstCP>=0) { 4988 /* 4989 * Partial match, return and revert to pivoting. 4990 * In normal from-UTF-16 conversion, we would just continue 4991 * but then exit the loop because the extension match would 4992 * have consumed the source. 4993 */ 4994 break; 4995 } else { 4996 /* a mapping was written to the target, continue */ 4997 4998 /* recalculate the targetCapacity after an extension mapping */ 4999 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 5000 } 5001 } 5002 } else { 5003 /* target is full */ 5004 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 5005 break; 5006 } 5007 } 5008 5009 /* 5010 * The sourceLimit may have been adjusted before the conversion loop 5011 * to stop before a truncated sequence. 5012 * If so, then collect the truncated sequence now. 
5013 */
5014 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5015 c=utf8->toUBytes[0]=b=*source++;
5016 toULength=1;
5017 toULimit=utf8_countTrailBytes[b]+1;
5018 while(source<sourceLimit) {
5019 utf8->toUBytes[toULength++]=b=*source++;
5020 c=(c<<6)+b;
5021 }
5022 utf8->toUnicodeStatus=c;
5023 utf8->toULength=toULength;
5024 utf8->mode=toULimit;
5025 }
5026
5027 /* write back the updated pointers */
5028 pToUArgs->source=(char *)source;
5029 pFromUArgs->target=(char *)target;
5030}
5031
/*
 * ucnv_DBCSFromUTF8: converts UTF-8 input directly to charset bytes without
 * an intermediate UTF-16 buffer.
 *
 * pFromUArgs  target side: converter (cnv), target, targetLimit, flush.
 *             pFromUArgs->target is advanced past the bytes written.
 * pToUArgs    source side: UTF-8 converter (utf8), source, sourceLimit.
 *             pToUArgs->source is advanced past the bytes consumed.
 * pErrorCode  receives U_BUFFER_OVERFLOW_ERROR when the target is full,
 *             U_ILLEGAL_CHAR_FOUND for an ill-formed UTF-8 sequence,
 *             or whatever failure _extFromU() reports.
 *
 * Each code point found in the tables produces one byte (value<=0xff) or two bytes.
 * Code points without a usable table mapping are handed to _extFromU().
 * A sequence that is cut off at sourceLimit is saved in utf8->toUBytes,
 * utf8->toULength, utf8->toUnicodeStatus and utf8->mode; a later call that has
 * target space resumes it at the moreBytes label.
 */
5032static void
5033ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
5034 UConverterToUnicodeArgs *pToUArgs,
5035 UErrorCode *pErrorCode) {
5036 UConverter *utf8, *cnv;
5037 const uint8_t *source, *sourceLimit;
5038 uint8_t *target;
5039 int32_t targetCapacity;
5040
5041 const uint16_t *table, *mbcsIndex;
5042 const uint16_t *results;
5043
5044 int8_t oldToULength, toULength, toULimit;
5045
5046 UChar32 c;
5047 uint8_t b, t1, t2;
5048
5049 uint32_t stage2Entry;
5050 uint32_t asciiRoundtrips;
/* NOTE(review): minValue is assigned below but is never read anywhere in this function */
5051 uint16_t value, minValue;
5052 UBool hasSupplementary;
5053
5054 /* set up the local pointers */
5055 utf8=pToUArgs->converter;
5056 cnv=pFromUArgs->converter;
5057 source=(uint8_t *)pToUArgs->source;
5058 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
5059 target=(uint8_t *)pFromUArgs->target;
5060 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
5061
5062 table=cnv->sharedData->mbcs.fromUnicodeTable;
5063 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
/* the LF/NL swap option selects an alternate result array */
5064 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
5065 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
5066 } else {
5067 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
5068 }
5069 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
5070
5071 if(cnv->useFallback) {
5072 /* use all roundtrip and fallback results */
5073 minValue=0x800;
5074 } else {
5075 /* use only roundtrips and fallbacks from private-use characters */
5076 minValue=0xc00;
5077 }
5078 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
5079
5080 /* get the converter state from the UTF-8 UConverter */
5081 c=(UChar32)utf8->toUnicodeStatus;
5082 if(c!=0) {
5083 toULength=oldToULength=utf8->toULength;
5084 toULimit=(int8_t)utf8->mode;
5085 } else {
5086 toULength=oldToULength=toULimit=0;
5087 }
5088
5089 /*
5090 * Make sure that the last byte sequence before sourceLimit is complete
5091 * or runs into a lead byte.
5092 * Do not go back into the bytes that will be read for finishing a partial
5093 * sequence from the previous buffer.
5094 * In the conversion loop compare source with sourceLimit only once
5095 * per multi-byte character.
5096 */
5097 {
5098 int32_t i, length;
5099
/* length: input bytes available beyond those still owed to the saved partial sequence */
5100 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
/* scan backwards over at most three trailing bytes, counting trail bytes in i */
5101 for(i=0; i<3 && i<length;) {
5102 b=*(sourceLimit-i-1);
5103 if(U8_IS_TRAIL(b)) {
5104 ++i;
5105 } else {
5106 if(i<utf8_countTrailBytes[b]) {
5107 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
5108 sourceLimit-=i+1;
5109 }
5110 break;
5111 }
5112 }
5113 }
5114
/* resume a partial sequence saved by an earlier call; the saved state is cleared before jumping */
5115 if(c!=0 && targetCapacity>0) {
5116 utf8->toUnicodeStatus=0;
5117 utf8->toULength=0;
5118 goto moreBytes;
5119 /* See note in ucnv_SBCSFromUTF8() about this goto. */
5120 }
5121
5122 /* conversion loop */
5123 while(source<sourceLimit) {
5124 if(targetCapacity>0) {
5125 b=*source++;
5126 if((int8_t)b>=0) {
5127 /* convert ASCII */
5128 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
5129 *target++=b;
5130 --targetCapacity;
5131 continue;
5132 } else {
5133 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
5134 if(value==0) {
5135 c=b;
5136 goto unassigned;
5137 }
5138 }
5139 } else {
5140 if(b>0xe0) {
/*
 * NOTE(review): the first operand below reads source[0] before b is range-checked.
 * Confirm that the sourceLimit adjustment above always leaves a readable byte here,
 * including for lead bytes above 0xf4; that depends on utf8_countTrailBytes,
 * which is not visible in this part of the file.
 */
5141 if( /* handle U+1000..U+D7FF inline */
5142 (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
5143 (b==0xed && (t1 <= 0x1f))) &&
5144 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
5145 ) {
/* c holds the code point without its low six bits; t2 supplies those bits */
5146 c=((b&0xf)<<6)|t1;
5147 source+=2;
5148 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
5149 if(value==0) {
5150 c=(c<<6)|t2;
5151 goto unassigned;
5152 }
5153 } else {
5154 c=-1;
5155 }
5156 } else if(b<0xe0) {
5157 if( /* handle U+0080..U+07FF inline */
5158 b>=0xc2 &&
5159 (t1=(uint8_t)(*source-0x80)) <= 0x3f
5160 ) {
5161 c=b&0x1f;
5162 ++source;
5163 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
5164 if(value==0) {
5165 c=(c<<6)|t1;
5166 goto unassigned;
5167 }
5168 } else {
5169 c=-1;
5170 }
5171 } else {
5172 c=-1;
5173 }
5174
/* c<0 marks input that neither inline path above accepted */
5175 if(c<0) {
5176 /* handle "complicated" and error cases, and continuing partial characters */
5177 oldToULength=0;
5178 toULength=1;
5179 toULimit=utf8_countTrailBytes[b]+1;
5180 c=b;
5181moreBytes:
5182 while(toULength<toULimit) {
5183 if(source<sourceLimit) {
5184 b=*source;
5185 if(U8_IS_TRAIL(b)) {
5186 ++source;
5187 ++toULength;
5188 c=(c<<6)+b;
5189 } else {
5190 break; /* sequence too short, stop with toULength<toULimit */
5191 }
5192 } else {
5193 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
5194 source-=(toULength-oldToULength);
5195 while(oldToULength<toULength) {
5196 utf8->toUBytes[oldToULength++]=*source++;
5197 }
5198 utf8->toUnicodeStatus=c;
5199 utf8->toULength=toULength;
5200 utf8->mode=toULimit;
5201 pToUArgs->source=(char *)source;
5202 pFromUArgs->target=(char *)target;
5203 return;
5204 }
5205 }
5206
/* c was accumulated with lead and trail marker bits included; utf8_offsets[] is subtracted to get the code point */
5207 if( toULength==toULimit && /* consumed all trail bytes */
5208 (toULength==3 || toULength==2) && /* BMP */
5209 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
5210 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
5211 ) {
5212 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5213 } else if(
5214 toULength==toULimit && toULength==4 &&
5215 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
5216 ) {
5217 /* supplementary code point */
5218 if(!hasSupplementary) {
5219 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5220 stage2Entry=0;
5221 } else {
5222 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5223 }
5224 } else {
5225 /* error handling: illegal UTF-8 byte sequence */
5226 source-=(toULength-oldToULength);
5227 while(oldToULength<toULength) {
5228 utf8->toUBytes[oldToULength++]=*source++;
5229 }
5230 utf8->toULength=toULength;
5231 pToUArgs->source=(char *)source;
5232 pFromUArgs->target=(char *)target;
5233 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
5234 return;
5235 }
5236
5237 /* get the bytes and the length for the output */
5238 /* MBCS_OUTPUT_2 */
5239 value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
5240
5241 /* is this code point assigned, or do we use fallbacks? */
5242 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
5243 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
5244 ) {
5245 goto unassigned;
5246 }
5247 }
5248 }
5249
5250 /* write the output character bytes from value and length */
5251 /* from the first if in the loop we know that targetCapacity>0 */
5252 if(value<=0xff) {
5253 /* this is easy because we know that there is enough space */
5254 *target++=(uint8_t)value;
5255 --targetCapacity;
5256 } else /* length==2 */ {
/* the first byte always fits; the second byte goes to charErrorBuffer when only one byte of space was left */
5257 *target++=(uint8_t)(value>>8);
5258 if(2<=targetCapacity) {
5259 *target++=(uint8_t)value;
5260 targetCapacity-=2;
5261 } else {
5262 cnv->charErrorBuffer[0]=(char)value;
5263 cnv->charErrorBufferLength=1;
5264
5265 /* target overflow */
5266 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5267 break;
5268 }
5269 }
5270 continue;
5271
5272unassigned:
5273 {
5274 /*
5275 * Try an extension mapping.
5276 * Pass in no source because we don't have UTF-16 input.
5277 * If we have a partial match on c, we will return and revert
5278 * to UTF-8->UTF-16->charset conversion.
5279 */
5280 static const UChar nul=0;
5281 const UChar *noSource=&nul;
5282 c=_extFromU(cnv, cnv->sharedData,
5283 c, &noSource, noSource,
5284 &target, target+targetCapacity,
5285 NULL, -1,
5286 pFromUArgs->flush,
5287 pErrorCode);
5288
5289 if(U_FAILURE(*pErrorCode)) {
5290 /* not mappable or buffer overflow */
5291 cnv->fromUChar32=c;
5292 break;
5293 } else if(cnv->preFromUFirstCP>=0) {
5294 /*
5295 * Partial match, return and revert to pivoting.
5296 * In normal from-UTF-16 conversion, we would just continue
5297 * but then exit the loop because the extension match would
5298 * have consumed the source.
5299 */
5300 break;
5301 } else {
5302 /* a mapping was written to the target, continue */
5303
5304 /* recalculate the targetCapacity after an extension mapping */
5305 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5306 continue;
5307 }
5308 }
5309 } else {
5310 /* target is full */
5311 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5312 break;
5313 }
5314 }
5315
5316 /*
5317 * The sourceLimit may have been adjusted before the conversion loop
5318 * to stop before a truncated sequence.
5319 * If so, then collect the truncated sequence now.
5320 */
5321 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5322 c=utf8->toUBytes[0]=b=*source++;
5323 toULength=1;
5324 toULimit=utf8_countTrailBytes[b]+1;
5325 while(source<sourceLimit) {
5326 utf8->toUBytes[toULength++]=b=*source++;
5327 c=(c<<6)+b;
5328 }
5329 utf8->toUnicodeStatus=c;
5330 utf8->toULength=toULength;
5331 utf8->mode=toULimit;
5332 }
5333
5334 /* write back the updated pointers */
5335 pToUArgs->source=(char *)source;
5336 pFromUArgs->target=(char *)target;
5337}
5338
5339/* miscellaneous ------------------------------------------------------------ */
5340
/*
 * ucnv_MBCSGetStarters: fills starters[0..255], one flag per byte value.
 * A flag is true when MBCS_ENTRY_IS_TRANSITION() holds for that byte in the
 * state-table row selected by mbcs.dbcsOnlyState (not a literal row 0).
 * pErrorCode is neither read nor written here.
 */
5341static void
5342ucnv_MBCSGetStarters(const UConverter* cnv,
5343 UBool starters[256],
5344 UErrorCode *pErrorCode) {
5345 const int32_t *state0;
5346 int i;
5347
5348 state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
5349 for(i=0; i<256; ++i) {
5350 /* all bytes that cause a state transition from state 0 are lead bytes */
5351 starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
5352 }
5353}
5354
5355/*
5356 * This is an internal function that allows other converter implementations
5357 * to check whether a byte is a lead byte.
5358 */
/*
 * ucnv_MBCSIsLeadByte: looks the byte up in row 0 of the state table, with the
 * char argument cast to uint8_t for indexing, and returns whether
 * MBCS_ENTRY_IS_TRANSITION() holds for that entry.
 */
5359U_CFUNC UBool
5360ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
5361 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
5362}
5363
/*
 * ucnv_MBCSWriteSub: writes the substitution bytes for an unmappable code point
 * through ucnv_cbFromUWriteBytes().
 *
 * pArgs        supplies the converter; passed on to ucnv_cbFromUWriteBytes().
 * offsetIndex  passed on unchanged to ucnv_cbFromUWriteBytes().
 * pErrorCode   set to U_ILLEGAL_ARGUMENT_ERROR when the converter is
 *              MBCS_OUTPUT_2_SISO and the chosen substitution is neither
 *              one nor two bytes long; nothing is written in that case.
 *
 * The one-byte cnv->subChar1 is chosen when it is nonzero and either
 * cnv->useSubChar1 is set (converters with extIndexes) or
 * cnv->invalidUCharBuffer[0] is at most 0xff (converters without).
 * Otherwise cnv->subChars with cnv->subCharLen is used.
 * cnv->useSubChar1 is always reset to FALSE.
 * For MBCS_OUTPUT_2_SISO a UCNV_SI or UCNV_SO byte is put in front when the
 * substitution length differs from the mode kept in cnv->fromUnicodeStatus,
 * and that mode is updated.
 */
5364static void
5365ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
5366 int32_t offsetIndex,
5367 UErrorCode *pErrorCode) {
5368 UConverter *cnv=pArgs->converter;
5369 char *p, *subchar;
/* the SI/SO path below stores at most three bytes here: one shift byte plus two substitution bytes */
5370 char buffer[4];
5371 int32_t length;
5372
5373 /* first, select between subChar and subChar1 */
5374 if( cnv->subChar1!=0 &&
5375 (cnv->sharedData->mbcs.extIndexes!=NULL ?
5376 cnv->useSubChar1 :
5377 (cnv->invalidUCharBuffer[0]<=0xff))
5378 ) {
5379 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
5380 subchar=(char *)&cnv->subChar1;
5381 length=1;
5382 } else {
5383 /* select subChar in all other cases */
5384 subchar=(char *)cnv->subChars;
5385 length=cnv->subCharLen;
5386 }
5387
5388 /* reset the selector for the next code point */
5389 cnv->useSubChar1=FALSE;
5390
5391 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
5392 p=buffer;
5393
5394 /* fromUnicodeStatus contains prevLength */
5395 switch(length) {
5396 case 1:
5397 if(cnv->fromUnicodeStatus==2) {
5398 /* DBCS mode and SBCS sub char: change to SBCS */
5399 cnv->fromUnicodeStatus=1;
5400 *p++=UCNV_SI;
5401 }
5402 *p++=subchar[0];
5403 break;
5404 case 2:
5405 if(cnv->fromUnicodeStatus<=1) {
5406 /* SBCS mode and DBCS sub char: change to DBCS */
5407 cnv->fromUnicodeStatus=2;
5408 *p++=UCNV_SO;
5409 }
5410 *p++=subchar[0];
5411 *p++=subchar[1];
5412 break;
5413 default:
5414 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
5415 return;
5416 }
/* from here on the bytes assembled in buffer are written instead of the raw substitution bytes */
5417 subchar=buffer;
5418 length=(int32_t)(p-buffer);
5419 }
5420
5421 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
5422}
5423
/*
 * ucnv_MBCSGetType: reports a more specific converter type than UCNV_MBCS
 * where the shared data allows it. The tests run in this order:
 *   countStates==1                         returns UCNV_SBCS
 *   outputType is MBCS_OUTPUT_2_SISO       returns UCNV_EBCDIC_STATEFUL
 *   minBytesPerChar and maxBytesPerChar==2 returns UCNV_DBCS
 *   anything else                          returns UCNV_MBCS
 */
5424U_CFUNC UConverterType
5425ucnv_MBCSGetType(const UConverter* converter) {
5426 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
5427 if(converter->sharedData->mbcs.countStates==1) {
5428 return (UConverterType)UCNV_SBCS;
5429 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
5430 return (UConverterType)UCNV_EBCDIC_STATEFUL;
5431 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
5432 return (UConverterType)UCNV_DBCS;
5433 }
5434 return (UConverterType)UCNV_MBCS;
5435}
5436
/*
 * _SBCSUTF8Impl: implementation table whose last initializer is ucnv_SBCSFromUTF8;
 * every earlier initializer is the same as in _MBCSImpl.
 * The initializers are positional, so their meaning is fixed by the
 * UConverterImpl declaration, which is not visible in this part of the file.
 */
5437static const UConverterImpl _SBCSUTF8Impl={
5438 UCNV_MBCS,
5439
5440 ucnv_MBCSLoad,
5441 ucnv_MBCSUnload,
5442
5443 ucnv_MBCSOpen,
5444 NULL,
5445 NULL,
5446
5447 ucnv_MBCSToUnicodeWithOffsets,
5448 ucnv_MBCSToUnicodeWithOffsets,
5449 ucnv_MBCSFromUnicodeWithOffsets,
5450 ucnv_MBCSFromUnicodeWithOffsets,
5451 ucnv_MBCSGetNextUChar,
5452
5453 ucnv_MBCSGetStarters,
5454 ucnv_MBCSGetName,
5455 ucnv_MBCSWriteSub,
5456 NULL,
5457 ucnv_MBCSGetUnicodeSet,
5458
5459 NULL,
5460 ucnv_SBCSFromUTF8
5461};
5462
/*
 * _DBCSUTF8Impl: same layout as _SBCSUTF8Impl, but its last initializer is
 * ucnv_DBCSFromUTF8.
 */
5463static const UConverterImpl _DBCSUTF8Impl={
5464 UCNV_MBCS,
5465
5466 ucnv_MBCSLoad,
5467 ucnv_MBCSUnload,
5468
5469 ucnv_MBCSOpen,
5470 NULL,
5471 NULL,
5472
5473 ucnv_MBCSToUnicodeWithOffsets,
5474 ucnv_MBCSToUnicodeWithOffsets,
5475 ucnv_MBCSFromUnicodeWithOffsets,
5476 ucnv_MBCSFromUnicodeWithOffsets,
5477 ucnv_MBCSGetNextUChar,
5478
5479 ucnv_MBCSGetStarters,
5480 ucnv_MBCSGetName,
5481 ucnv_MBCSWriteSub,
5482 NULL,
5483 ucnv_MBCSGetUnicodeSet,
5484
5485 NULL,
5486 ucnv_DBCSFromUTF8
5487};
5488
/*
 * _MBCSImpl: general implementation table. Its initializer list stops after
 * ucnv_MBCSGetUnicodeSet, so any UConverterImpl members after that one are
 * zero-initialized.
 */
5489static const UConverterImpl _MBCSImpl={
5490 UCNV_MBCS,
5491
5492 ucnv_MBCSLoad,
5493 ucnv_MBCSUnload,
5494
5495 ucnv_MBCSOpen,
5496 NULL,
5497 NULL,
5498
5499 ucnv_MBCSToUnicodeWithOffsets,
5500 ucnv_MBCSToUnicodeWithOffsets,
5501 ucnv_MBCSFromUnicodeWithOffsets,
5502 ucnv_MBCSFromUnicodeWithOffsets,
5503 ucnv_MBCSGetNextUChar,
5504
5505 ucnv_MBCSGetStarters,
5506 ucnv_MBCSGetName,
5507 ucnv_MBCSWriteSub,
5508 NULL,
5509 ucnv_MBCSGetUnicodeSet
5510};
5511
5512
5513/* Static data is in
tools/makeconv/ucnvstat.c for data-based
5514 * converters. Be sure to update it as well.
5515 */
5516
/*
 * _MBCSData: shared-data record for the MBCS converter type. It stores its own
 * size first and points at _MBCSImpl as its implementation table.
 * The remaining initializers are positional; their meaning is fixed by the
 * UConverterSharedData declaration, which is not visible in this part of the file.
 */
5517const UConverterSharedData _MBCSData={
5518 sizeof(UConverterSharedData), 1,
5519 NULL, NULL, NULL, FALSE, &_MBCSImpl,
5520 0
5521};
5522
5523#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
5524