1/* 2****************************************************************************** 3* 4* Copyright (C) 2000-2011, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7****************************************************************************** 8* file name: ucnvmbcs.c 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2000jul03 14* created by: Markus W. Scherer 15* 16* The current code in this file replaces the previous implementation 17* of conversion code from multi-byte codepages to Unicode and back. 18* This implementation supports the following: 19* - legacy variable-length codepages with up to 4 bytes per character 20* - all Unicode code points (up to 0x10ffff) 21* - efficient distinction of unassigned vs. illegal byte sequences 22* - it is possible in fromUnicode() to directly deal with simple 23* stateful encodings (used for EBCDIC_STATEFUL) 24* - it is possible to convert Unicode code points 25* to a single zero byte (but not as a fallback except for SBCS) 26* 27* Remaining limitations in fromUnicode: 28* - byte sequences must not have leading zero bytes 29* - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 30* - limitation to up to 4 bytes per character 31* 32* ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 33* limitations and adds m:n character mappings and other features. 34* See ucnv_ext.h for details. 
35* 36* Change history: 37* 38* 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 39* MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 40* macros to ucnvmbcs.h file 41*/ 42 43#include "unicode/utypes.h" 44 45#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 46 47#include "unicode/ucnv.h" 48#include "unicode/ucnv_cb.h" 49#include "unicode/udata.h" 50#include "unicode/uset.h" 51#include "ucnv_bld.h" 52#include "ucnvmbcs.h" 53#include "ucnv_ext.h" 54#include "ucnv_cnv.h" 55#include "umutex.h" 56#include "cmemory.h" 57#include "cstring.h" 58 59/* control optimizations according to the platform */ 60#define MBCS_UNROLL_SINGLE_TO_BMP 1 61#define MBCS_UNROLL_SINGLE_FROM_BMP 0 62 63/* 64 * _MBCSHeader versions 5.3 & 4.3 65 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 66 * 67 * This version is optional. Version 5 is used for incompatible data format changes. 68 * makeconv will continue to generate version 4 files if possible. 69 * 70 * Changes from version 4: 71 * 72 * The main difference is an additional _MBCSHeader field with 73 * - the length (number of uint32_t) of the _MBCSHeader 74 * - flags for further incompatible data format changes 75 * - flags for further, backward compatible data format changes 76 * 77 * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from 78 * the file and needs to be reconstituted at load time. 79 * This requires a utf8Friendly format with an additional mbcsIndex table for fast 80 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. 81 * (For details about these structures see below, and see ucnvmbcs.h.) 82 * 83 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order 84 * of the Unicode code points. (This requires that the .ucm file has the |0 etc. 85 * precision markers for all mappings.) 
*
 * All fallbacks have been moved to the extension table, leaving only roundtrips in the
 * omitted data that can be reconstituted from the toUnicode data.
 *
 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
 * With only roundtrip mappings in the base fromUnicode data, this part is fully
 * redundant with the mbcsIndex and will be reconstituted from that (also using the
 * stage 1 table which contains the information about how stage 2 was compacted).
 *
 * The rest of the stage 2 table, the part for code points above maxFastUChar,
 * is stored in the file and will be appended to the reconstituted part.
 *
 * The entire fromUBytes array is omitted from the file and will be reconstituted.
 * This is done by enumerating all toUnicode roundtrip mappings, performing
 * each mapping (using the stage 1 and reconstituted stage 2 tables) and
 * writing instead of reading the byte values.
 *
 * _MBCSHeader version 4.3
 *
 * Change from version 4.2:
 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
 *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
 *   files which can be used instead of stages 1 & 2.
 *   Faster lookups for roundtrips from most commonly used characters,
 *   and lookups from UTF-8 byte sequences with a natural bit distribution.
 *   See ucnvmbcs.h for more details.
 *
 * Change from version 4.1:
 * - Added an optional extension table structure at the end of the .cnv file.
 *   It is present if the upper bits of the header flags field contain a non-zero
 *   byte offset to it.
 *   Files that contain only a conversion table and no base table
 *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
 *   These contain the base table name between the MBCS header and the extension
 *   data.
121 * 122 * Change from version 4.0: 123 * - Replace header.reserved with header.fromUBytesLength so that all 124 * fields in the data have length. 125 * 126 * Changes from version 3 (for performance improvements): 127 * - new bit distribution for state table entries 128 * - reordered action codes 129 * - new data structure for single-byte fromUnicode 130 * + stage 2 only contains indexes 131 * + stage 3 stores 16 bits per character with classification bits 15..8 132 * - no multiplier for stage 1 entries 133 * - stage 2 for non-single-byte codepages contains the index and the flags in 134 * one 32-bit value 135 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 136 * 137 * For more details about old versions of the MBCS data structure, see 138 * the corresponding versions of this file. 139 * 140 * Converting stateless codepage data ---------------------------------------*** 141 * (or codepage data with simple states) to Unicode. 142 * 143 * Data structure and algorithm for converting from complex legacy codepages 144 * to Unicode. (Designed before 2000-may-22.) 145 * 146 * The basic idea is that the structure of legacy codepages can be described 147 * with state tables. 148 * When reading a byte stream, each input byte causes a state transition. 149 * Some transitions result in the output of a code point, some result in 150 * "unassigned" or "illegal" output. 151 * This is used here for character conversion. 152 * 153 * The data structure begins with a state table consisting of a row 154 * per state, with 256 entries (columns) per row for each possible input 155 * byte value. 156 * Each entry is 32 bits wide, with two formats distinguished by 157 * the sign bit (bit 31): 158 * 159 * One format for transitional entries (bit 31 not set) for non-final bytes, and 160 * one format for final entries (bit 31 set). 161 * Both formats contain the number of the next state in the same bit 162 * positions. 163 * State 0 is the initial state. 
164 * 165 * Most of the time, the offset values of subsequent states are added 166 * up to a scalar value. This value will eventually be the index of 167 * the Unicode code point in a table that follows the state table. 168 * The effect is that the code points for final state table rows 169 * are contiguous. The code points of final state rows follow each other 170 * in the order of the references to those final states by previous 171 * states, etc. 172 * 173 * For some terminal states, the offset is itself the output Unicode 174 * code point (16 bits for a BMP code point or 20 bits for a supplementary 175 * code point (stored as code point minus 0x10000 so that 20 bits are enough). 176 * For others, the code point in the Unicode table is stored with either 177 * one or two code units: one for BMP code points, two for a pair of 178 * surrogates. 179 * All code points for a final state entry take up the same number of code 180 * units, regardless of whether they all actually _use_ the same number 181 * of code units. This is necessary for simple array access. 182 * 183 * An additional feature comes in with what in ICU is called "fallback" 184 * mappings: 185 * 186 * In addition to round-trippable, precise, 1:1 mappings, there are often 187 * mappings defined between similar, though not the same, characters. 188 * Typically, such mappings occur only in fromUnicode mapping tables because 189 * Unicode has a superset repertoire of most other codepages. However, it 190 * is possible to provide such mappings in the toUnicode tables, too. 191 * In this case, the fallback mappings are partly integrated into the 192 * general state tables because the structure of the encoding includes their 193 * byte sequences. 194 * For final entries in an initial state, fallback mappings are stored in 195 * the entry itself like with roundtrip mappings. 196 * For other final entries, they are stored in the code units table if 197 * the entry is for a pair of code units. 
198 * For single-unit results in the code units table, there is no space to 199 * alternatively hold a fallback mapping; in this case, the code unit 200 * is stored as U+fffe (unassigned), and the fallback mapping needs to 201 * be looked up by the scalar offset value in a separate table. 202 * 203 * "Unassigned" state entries really mean "structurally unassigned", 204 * i.e., such a byte sequence will never have a mapping result. 205 * 206 * The interpretation of the bits in each entry is as follows: 207 * 208 * Bit 31 not set, not a terminal entry ("transitional"): 209 * 30..24 next state 210 * 23..0 offset delta, to be added up 211 * 212 * Bit 31 set, terminal ("final") entry: 213 * 30..24 next state (regardless of action code) 214 * 23..20 action code: 215 * action codes 0 and 1 result in precise-mapping Unicode code points 216 * 0 valid byte sequence 217 * 19..16 not used, 0 218 * 15..0 16-bit Unicode BMP code point 219 * never U+fffe or U+ffff 220 * 1 valid byte sequence 221 * 19..0 20-bit Unicode supplementary code point 222 * never U+fffe or U+ffff 223 * 224 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 225 * 2 valid byte sequence (fallback) 226 * 19..16 not used, 0 227 * 15..0 16-bit Unicode BMP code point as fallback result 228 * 3 valid byte sequence (fallback) 229 * 19..0 20-bit Unicode supplementary code point as fallback result 230 * 231 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 232 * depending on the code units they result in 233 * 4 valid byte sequence 234 * 19..9 not used, 0 235 * 8..0 final offset delta 236 * pointing to one 16-bit code unit which may be 237 * fffe unassigned -- look for a fallback for this offset 238 * ffff illegal 239 * 5 valid byte sequence 240 * 19..9 not used, 0 241 * 8..0 final offset delta 242 * pointing to two 16-bit code units 243 * (typically UTF-16 surrogates) 244 * the result depends on the first code unit as follows: 245 * 0000..d7ff 
roundtrip BMP code point (1st alone) 246 * d800..dbff roundtrip surrogate pair (1st, 2nd) 247 * dc00..dfff fallback surrogate pair (1st-400, 2nd) 248 * e000 roundtrip BMP code point (2nd alone) 249 * e001 fallback BMP code point (2nd alone) 250 * fffe unassigned 251 * ffff illegal 252 * (the final offset deltas are at most 255 * 2, 253 * times 2 because of storing code unit pairs) 254 * 255 * 6 unassigned byte sequence 256 * 19..16 not used, 0 257 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 258 * this does not contain a final offset delta because the main 259 * purpose of this action code is to save scalar offset values; 260 * therefore, fallback values cannot be assigned to byte 261 * sequences that result in this action code 262 * 7 illegal byte sequence 263 * 19..16 not used, 0 264 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 265 * 8 state change only 266 * 19..0 not used, 0 267 * useful for state changes in simple stateful encodings, 268 * at Shift-In/Shift-Out codes 269 * 270 * 271 * 9..15 reserved for future use 272 * current implementations will only perform a state change 273 * and ignore bits 19..0 274 * 275 * An encoding with contiguous ranges of unassigned byte sequences, like 276 * Shift-JIS and especially EUC-TW, can be stored efficiently by having 277 * at least two states for the trail bytes: 278 * One trail byte state that results in code points, and one that only 279 * has "unassigned" and "illegal" terminal states. 280 * 281 * Note: partly by accident, this data structure supports simple stateful 282 * encodings without any additional logic. 283 * Currently, only simple Shift-In/Shift-Out schemes are handled with 284 * appropriate state tables (especially EBCDIC_STATEFUL!). 
285 * 286 * MBCS version 2 added: 287 * unassigned and illegal action codes have U+fffe and U+ffff 288 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 289 * 290 * Converting from Unicode to codepage bytes --------------------------------*** 291 * 292 * The conversion data structure for fromUnicode is designed for the known 293 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 294 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 295 * a roundtrip mapping. 296 * 297 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 298 * like in the character properties table. 299 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 300 * with the resulting bytes is at offsetFromUBytes. 301 * 302 * Beginning with version 4, single-byte codepages have a significantly different 303 * trie compared to other codepages. 304 * In all cases, the entry in stage 1 is directly the index of the block of 305 * 64 entries in stage 2. 306 * 307 * Single-byte lookup: 308 * 309 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 310 * Stage 3 contains one 16-bit word per result: 311 * Bits 15..8 indicate the kind of result: 312 * f roundtrip result 313 * c fallback result from private-use code point 314 * 8 fallback result from other code points 315 * 0 unassigned 316 * Bits 7..0 contain the codepage byte. A zero byte is always possible. 317 * 318 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 319 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 320 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 321 * ASCII code points can be looked up with a linear array access into stage 3. 322 * See maxFastUChar and other details in ucnvmbcs.h. 
323 * 324 * Multi-byte lookup: 325 * 326 * Stage 2 contains a 32-bit word for each 16-block in stage 3: 327 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 328 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 329 * If this test is false, then a non-zero result will be interpreted as 330 * a fallback mapping. 331 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 332 * 333 * Stage 3 contains 2, 3, or 4 bytes per result. 334 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 335 * while 3 bytes are stored as bytes in big-endian order. 336 * Leading zero bytes are ignored, and the number of bytes is counted. 337 * A zero byte mapping result is possible as a roundtrip result. 338 * For some output types, the actual result is processed from this; 339 * see ucnv_MBCSFromUnicodeWithOffsets(). 340 * 341 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 342 * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 343 * 344 * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 345 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 346 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 347 * ASCII code points can be looked up with a linear array access into stage 3. 348 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 349 * 350 * In version 3, stage 2 blocks may overlap by multiples of the multiplier 351 * for compaction. 352 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 353 * may overlap by any number of entries. 
354 * 355 * MBCS version 2 added: 356 * the converter checks for known output types, which allows 357 * adding new ones without crashing an unaware converter 358 */ 359 360static const UConverterImpl _SBCSUTF8Impl; 361static const UConverterImpl _DBCSUTF8Impl; 362 363/* GB 18030 data ------------------------------------------------------------ */ 364 365/* helper macros for linear values for GB 18030 four-byte sequences */ 366#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 367 368#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 369 370#define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 371 372/* 373 * Some ranges of GB 18030 where both the Unicode code points and the 374 * GB four-byte sequences are contiguous and are handled algorithmically by 375 * the special callback functions below. 376 * The values are start & end of Unicode & GB codes. 377 * 378 * Note that single surrogates are not mapped by GB 18030 379 * as of the re-released mapping tables from 2000-nov-30. 
380 */ 381static const uint32_t 382gb18030Ranges[14][4]={ 383 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 384 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 385 {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)}, 386 {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)}, 387 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 388 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 389 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 390 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 391 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 392 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 393 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 394 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 395 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 396 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 397}; 398 399/* bit flag for UConverter.options indicating GB 18030 special handling */ 400#define _MBCS_OPTION_GB18030 0x8000 401 402/* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 403#define _MBCS_OPTION_KEIS 0x01000 404#define _MBCS_OPTION_JEF 0x02000 405#define _MBCS_OPTION_JIPS 0x04000 406 407#define KEIS_SO_CHAR_1 0x0A 408#define KEIS_SO_CHAR_2 0x42 409#define KEIS_SI_CHAR_1 0x0A 410#define KEIS_SI_CHAR_2 0x41 411 412#define JEF_SO_CHAR 0x28 413#define JEF_SI_CHAR 0x29 414 415#define JIPS_SO_CHAR_1 0x1A 416#define JIPS_SO_CHAR_2 0x70 417#define JIPS_SI_CHAR_1 0x1A 418#define JIPS_SI_CHAR_2 0x71 419 420enum SISO_Option { 421 SI, 422 SO 423}; 424typedef enum SISO_Option SISO_Option; 425 426static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { 427 int32_t SISOLength = 0; 428 429 switch (option) { 430 case SI: 431 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 432 value[0] = KEIS_SI_CHAR_1; 433 value[1] = KEIS_SI_CHAR_2; 434 SISOLength = 2; 435 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 436 
value[0] = JEF_SI_CHAR; 437 SISOLength = 1; 438 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 439 value[0] = JIPS_SI_CHAR_1; 440 value[1] = JIPS_SI_CHAR_2; 441 SISOLength = 2; 442 } else { 443 value[0] = UCNV_SI; 444 SISOLength = 1; 445 } 446 break; 447 case SO: 448 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 449 value[0] = KEIS_SO_CHAR_1; 450 value[1] = KEIS_SO_CHAR_2; 451 SISOLength = 2; 452 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 453 value[0] = JEF_SO_CHAR; 454 SISOLength = 1; 455 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 456 value[0] = JIPS_SO_CHAR_1; 457 value[1] = JIPS_SO_CHAR_2; 458 SISOLength = 2; 459 } else { 460 value[0] = UCNV_SO; 461 SISOLength = 1; 462 } 463 break; 464 default: 465 /* Should never happen. */ 466 break; 467 } 468 469 return SISOLength; 470} 471 472/* Miscellaneous ------------------------------------------------------------ */ 473 474/** 475 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from 476 * consecutive sequences of bytes, starting from the one encoded in value, 477 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) 478 * Does not currently support m:n mappings or reverse fallbacks. 479 * This function will not be called for sequences of bytes with leading zeros. 
480 * 481 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() 482 * @param value contains 1..4 bytes of the first byte sequence, right-aligned 483 * @param codePoints resulting Unicode code points, or negative if a byte sequence does 484 * not map to anything 485 * @return TRUE to continue enumeration, FALSE to stop 486 */ 487typedef UBool U_CALLCONV 488UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); 489 490/* similar to ucnv_MBCSGetNextUChar() but recursive */ 491static UBool 492enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], 493 int32_t state, uint32_t offset, 494 uint32_t value, 495 UConverterEnumToUCallback *callback, const void *context, 496 UErrorCode *pErrorCode) { 497 UChar32 codePoints[32]; 498 const int32_t *row; 499 const uint16_t *unicodeCodeUnits; 500 UChar32 anyCodePoints; 501 int32_t b, limit; 502 503 row=mbcsTable->stateTable[state]; 504 unicodeCodeUnits=mbcsTable->unicodeCodeUnits; 505 506 value<<=8; 507 anyCodePoints=-1; /* becomes non-negative if there is a mapping */ 508 509 b=(stateProps[state]&0x38)<<2; 510 if(b==0 && stateProps[state]>=0x40) { 511 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ 512 codePoints[0]=U_SENTINEL; 513 b=1; 514 } 515 limit=((stateProps[state]&7)+1)<<5; 516 while(b<limit) { 517 int32_t entry=row[b]; 518 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 519 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry); 520 if(stateProps[nextState]>=0) { 521 /* recurse to a state with non-ignorable actions */ 522 if(!enumToU( 523 mbcsTable, stateProps, nextState, 524 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 525 value|(uint32_t)b, 526 callback, context, 527 pErrorCode)) { 528 return FALSE; 529 } 530 } 531 codePoints[b&0x1f]=U_SENTINEL; 532 } else { 533 UChar32 c; 534 int32_t action; 535 536 /* 537 * An if-else-if chain provides more reliable performance for 538 * the most common cases compared to a switch. 
539 */ 540 action=MBCS_ENTRY_FINAL_ACTION(entry); 541 if(action==MBCS_STATE_VALID_DIRECT_16) { 542 /* output BMP code point */ 543 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 544 } else if(action==MBCS_STATE_VALID_16) { 545 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 546 c=unicodeCodeUnits[finalOffset]; 547 if(c<0xfffe) { 548 /* output BMP code point */ 549 } else { 550 c=U_SENTINEL; 551 } 552 } else if(action==MBCS_STATE_VALID_16_PAIR) { 553 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 554 c=unicodeCodeUnits[finalOffset++]; 555 if(c<0xd800) { 556 /* output BMP code point below 0xd800 */ 557 } else if(c<=0xdbff) { 558 /* output roundtrip or fallback supplementary code point */ 559 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 560 } else if(c==0xe000) { 561 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 562 c=unicodeCodeUnits[finalOffset]; 563 } else { 564 c=U_SENTINEL; 565 } 566 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 567 /* output supplementary code point */ 568 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 569 } else { 570 c=U_SENTINEL; 571 } 572 573 codePoints[b&0x1f]=c; 574 anyCodePoints&=c; 575 } 576 if(((++b)&0x1f)==0) { 577 if(anyCodePoints>=0) { 578 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) { 579 return FALSE; 580 } 581 anyCodePoints=-1; 582 } 583 } 584 } 585 return TRUE; 586} 587 588/* 589 * Only called if stateProps[state]==-1. 590 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 591 * MBCS_STATE_CHANGE_ONLY. 
592 */ 593static int8_t 594getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { 595 const int32_t *row; 596 int32_t min, max, entry, nextState; 597 598 row=stateTable[state]; 599 stateProps[state]=0; 600 601 /* find first non-ignorable state */ 602 for(min=0;; ++min) { 603 entry=row[min]; 604 nextState=MBCS_ENTRY_STATE(entry); 605 if(stateProps[nextState]==-1) { 606 getStateProp(stateTable, stateProps, nextState); 607 } 608 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 609 if(stateProps[nextState]>=0) { 610 break; 611 } 612 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 613 break; 614 } 615 if(min==0xff) { 616 stateProps[state]=-0x40; /* (int8_t)0xc0 */ 617 return stateProps[state]; 618 } 619 } 620 stateProps[state]|=(int8_t)((min>>5)<<3); 621 622 /* find last non-ignorable state */ 623 for(max=0xff; min<max; --max) { 624 entry=row[max]; 625 nextState=MBCS_ENTRY_STATE(entry); 626 if(stateProps[nextState]==-1) { 627 getStateProp(stateTable, stateProps, nextState); 628 } 629 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 630 if(stateProps[nextState]>=0) { 631 break; 632 } 633 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 634 break; 635 } 636 } 637 stateProps[state]|=(int8_t)(max>>5); 638 639 /* recurse further and collect direct-state information */ 640 while(min<=max) { 641 entry=row[min]; 642 nextState=MBCS_ENTRY_STATE(entry); 643 if(stateProps[nextState]==-1) { 644 getStateProp(stateTable, stateProps, nextState); 645 } 646 if(MBCS_ENTRY_IS_FINAL(entry)) { 647 stateProps[nextState]|=0x40; 648 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { 649 stateProps[state]|=0x40; 650 } 651 } 652 ++min; 653 } 654 return stateProps[state]; 655} 656 657/* 658 * Internal function enumerating the toUnicode data of an MBCS converter. 
659 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 660 * table, but could also be used for a future ucnv_getUnicodeSet() option 661 * that includes reverse fallbacks (after updating this function's implementation). 662 * Currently only handles roundtrip mappings. 663 * Does not currently handle extensions. 664 */ 665static void 666ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, 667 UConverterEnumToUCallback *callback, const void *context, 668 UErrorCode *pErrorCode) { 669 /* 670 * Properties for each state, to speed up the enumeration. 671 * Ignorable actions are unassigned/illegal/state-change-only: 672 * They do not lead to mappings. 673 * 674 * Bits 7..6: 675 * 1 direct/initial state (stateful converters have multiple) 676 * 0 non-initial state with transitions or with non-ignorable result actions 677 * -1 final state with only ignorable actions 678 * 679 * Bits 5..3: 680 * The lowest byte value with non-ignorable actions is 681 * value<<5 (rounded down). 682 * 683 * Bits 2..0: 684 * The highest byte value with non-ignorable actions is 685 * (value<<5)&0x1f (rounded up). 
686 */ 687 int8_t stateProps[MBCS_MAX_STATE_COUNT]; 688 int32_t state; 689 690 uprv_memset(stateProps, -1, sizeof(stateProps)); 691 692 /* recurse from state 0 and set all stateProps */ 693 getStateProp(mbcsTable->stateTable, stateProps, 0); 694 695 for(state=0; state<mbcsTable->countStates; ++state) { 696 /*if(stateProps[state]==-1) { 697 printf("unused/unreachable <icu:state> %d\n", state); 698 }*/ 699 if(stateProps[state]>=0x40) { 700 /* start from each direct state */ 701 enumToU( 702 mbcsTable, stateProps, state, 0, 0, 703 callback, context, 704 pErrorCode); 705 } 706 } 707} 708 709U_CFUNC void 710ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, 711 const USetAdder *sa, 712 UConverterUnicodeSet which, 713 UConverterSetFilter filter, 714 UErrorCode *pErrorCode) { 715 const UConverterMBCSTable *mbcsTable; 716 const uint16_t *table; 717 718 uint32_t st3; 719 uint16_t st1, maxStage1, st2; 720 721 UChar32 c; 722 723 /* enumerate the from-Unicode trie table */ 724 mbcsTable=&sharedData->mbcs; 725 table=mbcsTable->fromUnicodeTable; 726 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { 727 maxStage1=0x440; 728 } else { 729 maxStage1=0x40; 730 } 731 732 c=0; /* keep track of the current code point while enumerating */ 733 734 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 735 const uint16_t *stage2, *stage3, *results; 736 uint16_t minValue; 737 738 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 739 740 /* 741 * Set a threshold variable for selecting which mappings to use. 742 * See ucnv_MBCSSingleFromBMPWithOffsets() and 743 * MBCS_SINGLE_RESULT_FROM_U() for details. 
744 */ 745 if(which==UCNV_ROUNDTRIP_SET) { 746 /* use only roundtrips */ 747 minValue=0xf00; 748 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 749 /* use all roundtrip and fallback results */ 750 minValue=0x800; 751 } 752 753 for(st1=0; st1<maxStage1; ++st1) { 754 st2=table[st1]; 755 if(st2>maxStage1) { 756 stage2=table+st2; 757 for(st2=0; st2<64; ++st2) { 758 if((st3=stage2[st2])!=0) { 759 /* read the stage 3 block */ 760 stage3=results+st3; 761 762 do { 763 if(*stage3++>=minValue) { 764 sa->add(sa->set, c); 765 } 766 } while((++c&0xf)!=0); 767 } else { 768 c+=16; /* empty stage 3 block */ 769 } 770 } 771 } else { 772 c+=1024; /* empty stage 2 block */ 773 } 774 } 775 } else { 776 const uint32_t *stage2; 777 const uint8_t *stage3, *bytes; 778 uint32_t st3Multiplier; 779 uint32_t value; 780 UBool useFallback; 781 782 bytes=mbcsTable->fromUnicodeBytes; 783 784 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); 785 786 switch(mbcsTable->outputType) { 787 case MBCS_OUTPUT_3: 788 case MBCS_OUTPUT_4_EUC: 789 st3Multiplier=3; 790 break; 791 case MBCS_OUTPUT_4: 792 st3Multiplier=4; 793 break; 794 default: 795 st3Multiplier=2; 796 break; 797 } 798 799 for(st1=0; st1<maxStage1; ++st1) { 800 st2=table[st1]; 801 if(st2>(maxStage1>>1)) { 802 stage2=(const uint32_t *)table+st2; 803 for(st2=0; st2<64; ++st2) { 804 if((st3=stage2[st2])!=0) { 805 /* read the stage 3 block */ 806 stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; 807 808 /* get the roundtrip flags for the stage 3 block */ 809 st3>>=16; 810 811 /* 812 * Add code points for which the roundtrip flag is set, 813 * or which map to non-zero bytes if we use fallbacks. 814 * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
815 */ 816 switch(filter) { 817 case UCNV_SET_FILTER_NONE: 818 do { 819 if(st3&1) { 820 sa->add(sa->set, c); 821 stage3+=st3Multiplier; 822 } else if(useFallback) { 823 uint8_t b=0; 824 switch(st3Multiplier) { 825 case 4: 826 b|=*stage3++; 827 case 3: 828 b|=*stage3++; 829 case 2: 830 b|=stage3[0]|stage3[1]; 831 stage3+=2; 832 default: 833 break; 834 } 835 if(b!=0) { 836 sa->add(sa->set, c); 837 } 838 } 839 st3>>=1; 840 } while((++c&0xf)!=0); 841 break; 842 case UCNV_SET_FILTER_DBCS_ONLY: 843 /* Ignore single-byte results (<0x100). */ 844 do { 845 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { 846 sa->add(sa->set, c); 847 } 848 st3>>=1; 849 stage3+=2; /* +=st3Multiplier */ 850 } while((++c&0xf)!=0); 851 break; 852 case UCNV_SET_FILTER_2022_CN: 853 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ 854 do { 855 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { 856 sa->add(sa->set, c); 857 } 858 st3>>=1; 859 stage3+=3; /* +=st3Multiplier */ 860 } while((++c&0xf)!=0); 861 break; 862 case UCNV_SET_FILTER_SJIS: 863 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ 864 do { 865 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { 866 sa->add(sa->set, c); 867 } 868 st3>>=1; 869 stage3+=2; /* +=st3Multiplier */ 870 } while((++c&0xf)!=0); 871 break; 872 case UCNV_SET_FILTER_GR94DBCS: 873 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */ 874 do { 875 if( ((st3&1)!=0 || useFallback) && 876 (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && 877 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 878 ) { 879 sa->add(sa->set, c); 880 } 881 st3>>=1; 882 stage3+=2; /* +=st3Multiplier */ 883 } while((++c&0xf)!=0); 884 break; 885 case UCNV_SET_FILTER_HZ: 886 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). 
*/ 887 do { 888 if( ((st3&1)!=0 || useFallback) && 889 (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && 890 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 891 ) { 892 sa->add(sa->set, c); 893 } 894 st3>>=1; 895 stage3+=2; /* +=st3Multiplier */ 896 } while((++c&0xf)!=0); 897 break; 898 default: 899 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 900 return; 901 } 902 } else { 903 c+=16; /* empty stage 3 block */ 904 } 905 } 906 } else { 907 c+=1024; /* empty stage 2 block */ 908 } 909 } 910 } 911 912 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); 913} 914 915U_CFUNC void 916ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, 917 const USetAdder *sa, 918 UConverterUnicodeSet which, 919 UErrorCode *pErrorCode) { 920 ucnv_MBCSGetFilteredUnicodeSetForUnicode( 921 sharedData, sa, which, 922 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 923 UCNV_SET_FILTER_DBCS_ONLY : 924 UCNV_SET_FILTER_NONE, 925 pErrorCode); 926} 927 928static void 929ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 930 const USetAdder *sa, 931 UConverterUnicodeSet which, 932 UErrorCode *pErrorCode) { 933 if(cnv->options&_MBCS_OPTION_GB18030) { 934 sa->addRange(sa->set, 0, 0xd7ff); 935 sa->addRange(sa->set, 0xe000, 0x10ffff); 936 } else { 937 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode); 938 } 939} 940 941/* conversion extensions for input not in the main table -------------------- */ 942 943/* 944 * Hardcoded extension handling for GB 18030. 945 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. 946 * 947 * In the future, conversion extensions may handle m:n mappings and delta tables, 948 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html 949 * 950 * If an input character cannot be mapped, then these functions set an error 951 * code. The framework will then call the callback function. 
952 */ 953 954/* 955 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 956 * else return 0 after output has been written to the target 957 */ 958static UChar32 959_extFromU(UConverter *cnv, const UConverterSharedData *sharedData, 960 UChar32 cp, 961 const UChar **source, const UChar *sourceLimit, 962 uint8_t **target, const uint8_t *targetLimit, 963 int32_t **offsets, int32_t sourceIndex, 964 UBool flush, 965 UErrorCode *pErrorCode) { 966 const int32_t *cx; 967 968 cnv->useSubChar1=FALSE; 969 970 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 971 ucnv_extInitialMatchFromU( 972 cnv, cx, 973 cp, source, sourceLimit, 974 (char **)target, (char *)targetLimit, 975 offsets, sourceIndex, 976 flush, 977 pErrorCode) 978 ) { 979 return 0; /* an extension mapping handled the input */ 980 } 981 982 /* GB 18030 */ 983 if((cnv->options&_MBCS_OPTION_GB18030)!=0) { 984 const uint32_t *range; 985 int32_t i; 986 987 range=gb18030Ranges[0]; 988 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 989 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) { 990 /* found the Unicode code point, output the four-byte sequence for it */ 991 uint32_t linear; 992 char bytes[4]; 993 994 /* get the linear value of the first GB 18030 code in this range */ 995 linear=range[2]-LINEAR_18030_BASE; 996 997 /* add the offset from the beginning of the range */ 998 linear+=((uint32_t)cp-range[0]); 999 1000 /* turn this into a four-byte sequence */ 1001 bytes[3]=(char)(0x30+linear%10); linear/=10; 1002 bytes[2]=(char)(0x81+linear%126); linear/=126; 1003 bytes[1]=(char)(0x30+linear%10); linear/=10; 1004 bytes[0]=(char)(0x81+linear); 1005 1006 /* output this sequence */ 1007 ucnv_fromUWriteBytes(cnv, 1008 bytes, 4, (char **)target, (char *)targetLimit, 1009 offsets, sourceIndex, pErrorCode); 1010 return 0; 1011 } 1012 } 1013 } 1014 1015 /* no mapping */ 1016 *pErrorCode=U_INVALID_CHAR_FOUND; 1017 return cp; 1018} 1019 1020/* 1021 * Input sequence: 
cnv->toUBytes[0..length[ 1022 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input 1023 * else return 0 after output has been written to the target 1024 */ 1025static int8_t 1026_extToU(UConverter *cnv, const UConverterSharedData *sharedData, 1027 int8_t length, 1028 const uint8_t **source, const uint8_t *sourceLimit, 1029 UChar **target, const UChar *targetLimit, 1030 int32_t **offsets, int32_t sourceIndex, 1031 UBool flush, 1032 UErrorCode *pErrorCode) { 1033 const int32_t *cx; 1034 1035 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1036 ucnv_extInitialMatchToU( 1037 cnv, cx, 1038 length, (const char **)source, (const char *)sourceLimit, 1039 target, targetLimit, 1040 offsets, sourceIndex, 1041 flush, 1042 pErrorCode) 1043 ) { 1044 return 0; /* an extension mapping handled the input */ 1045 } 1046 1047 /* GB 18030 */ 1048 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { 1049 const uint32_t *range; 1050 uint32_t linear; 1051 int32_t i; 1052 1053 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]); 1054 range=gb18030Ranges[0]; 1055 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 1056 if(range[2]<=linear && linear<=range[3]) { 1057 /* found the sequence, output the Unicode code point for it */ 1058 *pErrorCode=U_ZERO_ERROR; 1059 1060 /* add the linear difference between the input and start sequences to the start code point */ 1061 linear=range[0]+(linear-range[2]); 1062 1063 /* output this code point */ 1064 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode); 1065 1066 return 0; 1067 } 1068 } 1069 } 1070 1071 /* no mapping */ 1072 *pErrorCode=U_INVALID_CHAR_FOUND; 1073 return length; 1074} 1075 1076/* EBCDIC swap LF<->NL ------------------------------------------------------ */ 1077 1078/* 1079 * This code modifies a standard EBCDIC<->Unicode mapping table for 1080 * OS/390 (z/OS) Unix System Services (Open Edition). 
* The difference is in the mapping of Line Feed and New Line control codes:
 * Standard EBCDIC maps
 *
 *   <U000A> \x25 |0
 *   <U0085> \x15 |0
 *
 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
 * mapping
 *
 *   <U000A> \x15 |0
 *   <U0085> \x25 |0
 *
 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
 * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support the same EBCDIC charset in both versions
 * without duplicating the entire installed table.
 */

/* standard EBCDIC codes */
#define EBCDIC_LF 0x25
#define EBCDIC_NL 0x15

/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
#define EBCDIC_RT_LF 0xf25
#define EBCDIC_RT_NL 0xf15

/* Unicode code points */
#define U_LF 0x0a
#define U_NL 0x85

/*
 * Build the LF/NL-swapped variant of an EBCDIC table and attach it to sharedData->mbcs.
 *
 * One block is allocated and laid out as
 *   [copy of the state table][copy of fromUnicodeBytes][converter name + swap option string]
 * and the three swapLFNL* fields of the table are pointed into it.
 * The fields are only set if swapLFNLStateTable is still NULL when checked
 * under umtx_lock(NULL); otherwise the freshly built block is freed again.
 *
 * @return TRUE if the swapped data was built (whether or not this call
 *              was the one that installed it);
 *         FALSE with *pErrorCode unchanged if the table is not an SBCS or
 *              SI/SO EBCDIC table with the standard LF/NL mappings
 *              (the option simply does not apply);
 *         FALSE with U_INVALID_FORMAT_ERROR if fromUBytesLength is 0, or with
 *              U_MEMORY_ALLOCATION_ERROR if the allocation failed
 */
static UBool
_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
    UConverterMBCSTable *mbcsTable;

    const uint16_t *table, *results;
    const uint8_t *bytes;

    int32_t (*newStateTable)[256];
    uint16_t *newResults;
    uint8_t *p;
    char *name;

    uint32_t stage2Entry;
    uint32_t size, sizeofFromUBytes;

    mbcsTable=&sharedData->mbcs;

    table=mbcsTable->fromUnicodeTable;
    bytes=mbcsTable->fromUnicodeBytes;
    results=(const uint16_t *)bytes;

    /*
     * Check that this is an EBCDIC table with SBCS portion -
     * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
     *
     * If not, ignore the option. Options are always ignored if they do not apply.
     */
    if(!(
         (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
         mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
         mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
    )) {
        return FALSE;
    }

    /* the fromUnicode direction must also have the standard roundtrip mappings */
    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
        if(!(
             EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
             EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
        )) {
            return FALSE;
        }
    } else /* MBCS_OUTPUT_2_SISO */ {
        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
        if(!(
             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
             EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
        )) {
            return FALSE;
        }

        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
        if(!(
             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
             EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
        )) {
            return FALSE;
        }
    }

    if(mbcsTable->fromUBytesLength>0) {
        /*
         * We _know_ the number of bytes in the fromUnicodeBytes array
         * starting with header.version 4.1.
         */
        sizeofFromUBytes=mbcsTable->fromUBytesLength;
    } else {
        /*
         * Otherwise:
         * There used to be code to enumerate the fromUnicode
         * trie and find the highest entry, but it was removed in ICU 3.2
         * because it was not tested and caused a low code coverage number.
         * See Jitterbug 3674.
         * This affects only some .cnv file formats with a header.version
         * below 4.1, and only when swaplfnl is requested.
         *
         * ucnvmbcs.c revision 1.99 is the last one with the
         * ucnv_MBCSSizeofFromUBytes() function.
         */
        *pErrorCode=U_INVALID_FORMAT_ERROR;
        return FALSE;
    }

    /*
     * The table has an appropriate format.
     * Allocate and build
     * - a modified to-Unicode state table
     * - a modified from-Unicode output array
     * - a converter name string with the swap option appended
     */
    size=
        mbcsTable->countStates*1024+    /* 256 int32_t entries per state */
        sizeofFromUBytes+
        UCNV_MAX_CONVERTER_NAME_LENGTH+20;
    p=(uint8_t *)uprv_malloc(size);
    if(p==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return FALSE;
    }

    /* copy and modify the to-Unicode state table */
    newStateTable=(int32_t (*)[256])p;
    uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);

    newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
    newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);

    /* copy and modify the from-Unicode result table; it starts right after the last state */
    newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
    uprv_memcpy(newResults, bytes, sizeofFromUBytes);

    /* conveniently, the table access macros work on the left side of expressions */
    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
    } else /* MBCS_OUTPUT_2_SISO */ {
        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;

        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
    }

    /* set the canonical converter name; it is stored right after the result table copy */
    name=(char *)newResults+sizeofFromUBytes;
    uprv_strcpy(name, sharedData->staticData->name);
    uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);

    /* set the pointers */
    umtx_lock(NULL);
    if(mbcsTable->swapLFNLStateTable==NULL) {
        mbcsTable->swapLFNLStateTable=newStateTable;
        mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
        mbcsTable->swapLFNLName=name;

        /* ownership has moved to the table: do not free below */
        newStateTable=NULL;
    }
    umtx_unlock(NULL);

    /* release the allocated memory if another thread beat us to it */
    if(newStateTable!=NULL) {
        uprv_free(newStateTable);
    }
    return TRUE;
}

/* reconstitute omitted fromUnicode data ------------------------------------ */

/*
 * Enumeration callback (passed to ucnv_MBCSEnumToUnicode() by reconstituteData()):
 * write roundtrip mappings into the fromUnicode stage 3 and set their roundtrip flags.
 *
 * @param context    the UConverterMBCSTable whose fromUnicode data is being rebuilt
 * @param value      codepage value for codePoints[0]; codePoints[i] gets value+i
 *                   (after the EUC transformation below)
 * @param codePoints 32 code points; negative entries are skipped
 * @return always TRUE
 *
 * for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC()
 */
static UBool U_CALLCONV
writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
    UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
    const uint16_t *table;
    uint32_t *stage2;
    uint8_t *bytes, *p;
    UChar32 c;
    int32_t i, st3;

    table=mbcsTable->fromUnicodeTable;
    bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;

    /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
    switch(mbcsTable->outputType) {
    case MBCS_OUTPUT_3_EUC:
        if(value<=0xffff) {
            /* short sequences are stored directly */
            /* code set 0 or 1 */
        } else if(value<=0x8effff) {
            /* code set 2 */
            value&=0x7fff;
        } else /* first byte is 0x8f */ {
            /* code set 3 */
            value&=0xff7f;
        }
        break;
    case MBCS_OUTPUT_4_EUC:
        if(value<=0xffffff) {
            /* short sequences are stored directly */
            /* code set 0 or 1 */
        } else if(value<=0x8effffff) {
            /* code set 2 */
            value&=0x7fffff;
        } else /* first byte is 0x8f */ {
            /* code set 3 */
            value&=0xff7fff;
        }
        break;
    default:
        break;
    }

    /* value is incremented for every slot, including the skipped (negative) ones */
    for(i=0; i<=0x1f; ++value, ++i) {
        c=codePoints[i];
        if(c<0) {
            continue;
        }

        /* locate the stage 2 & 3 data */
        stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
        p=bytes;
        /* the low 16 bits of the stage 2 entry index a stage 3 block of 16 entries */
        st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);

        /* write the codepage bytes into stage 3 */
        switch(mbcsTable->outputType) {
        case MBCS_OUTPUT_3:
        case MBCS_OUTPUT_4_EUC:
            /* 3 bytes per character, most significant byte first */
            p+=st3*3;
            p[0]=(uint8_t)(value>>16);
            p[1]=(uint8_t)(value>>8);
            p[2]=(uint8_t)value;
            break;
        case MBCS_OUTPUT_4:
            ((uint32_t *)p)[st3]=value;
            break;
        default:
            /* 2 bytes per character */
            ((uint16_t *)p)[st3]=(uint16_t)value;
            break;
        }

        /* set the roundtrip flag */
        *stage2|=(1UL<<(16+(c&0xf)));
    }
    return TRUE;
 }

/*
 * Rebuild the fromUnicode data that was omitted from the file
 * (called from ucnv_MBCSLoad() when the noFromU option is set).
 *
 * Allocates and zeroes one block for stage 1, the full stage 2 and the
 * stage 3 bytes, copies the stage 1 and the stored part of stage 2 into it,
 * reroutes fromUnicodeTable/fromUnicodeBytes to the block, fills in the
 * initial part of stage 2 from the mbcsIndex, and finally enumerates the
 * toUnicode mappings to write the roundtrip results.
 * The block is remembered in mbcsTable->reconstitutedData.
 *
 * Sets U_MEMORY_ALLOCATION_ERROR if the block cannot be allocated.
 */
static void
reconstituteData(UConverterMBCSTable *mbcsTable,
                 uint32_t stage1Length, uint32_t stage2Length,
                 uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
                 UErrorCode *pErrorCode) {
    uint16_t *stage1;
    uint32_t *stage2;
    uint8_t *bytes;
    /* stage 1 has 16-bit units, stage 2 has 32-bit units, stage 3 is counted in bytes */
    uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
    mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
    if(mbcsTable->reconstitutedData==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);

    /* copy existing data and reroute the pointers */
    stage1=(uint16_t *)mbcsTable->reconstitutedData;
    uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);

    /* the stored stage 2 entries go to the end of the full stage 2 area */
    stage2=(uint32_t *)(stage1+stage1Length);
    uprv_memcpy(stage2+(fullStage2Length-stage2Length),
                mbcsTable->fromUnicodeTable+stage1Length,
                stage2Length*4);

    mbcsTable->fromUnicodeTable=stage1;
    mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);

    /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
    stage2=(uint32_t *)stage1;

    /* reconstitute the initial part of stage 2 from the mbcsIndex */
    {
        int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
        int32_t stageUTF8Index=0;
        int32_t st1, st2, st3, i;

        for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
            st2=stage1[st1];
            if(st2!=stage1Length/2) {
                /*
each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
                for(i=0; i<16; ++i) {
                    st3=mbcsTable->mbcsIndex[stageUTF8Index++];
                    if(st3!=0) {
                        /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
                        st3>>=4;
                        /*
                         * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
                         * allocated together as a single 64-block for access from the mbcsIndex
                         */
                        stage2[st2++]=st3++;
                        stage2[st2++]=st3++;
                        stage2[st2++]=st3++;
                        stage2[st2++]=st3;
                    } else {
                        /* no stage 3 block, skip */
                        st2+=4;
                    }
                }
            } else {
                /* no stage 2 block, skip */
                stageUTF8Index+=16;
            }
        }
    }

    /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
    ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
}

/* MBCS setup functions ----------------------------------------------------- */

/*
 * Initialize sharedData->mbcs from the raw _MBCSHeader and the data that follows it.
 *
 * - Accepts header version 4.x, or 5.3 and higher without unknown incompatible
 *   option bits; anything else yields U_INVALID_TABLE_FORMAT.
 * - For an extension-only file (MBCS_OUTPUT_EXT_ONLY) the base table named
 *   after the header is loaded via ucnv_load(), its table data is copied, and
 *   a runtime-only MBCS_OUTPUT_DBCS_ONLY type may be set up.
 * - Otherwise the table pointers are set to the offsets given in the header,
 *   the utf8Friendly structures and the asciiRoundtrips bit set are prepared,
 *   and omitted fromUnicode data is reconstituted if the file requests it.
 * - If pArgs->onlyTestIsLoadable is set, the function returns as soon as the
 *   format has been validated.
 *
 * @param sharedData the shared data whose mbcs member is filled in
 * @param pArgs      load arguments (nestedLoads, onlyTestIsLoadable, options, pkg, ...)
 * @param raw        pointer to the _MBCSHeader at the start of the table data
 * @param pErrorCode set on invalid data, failed base-table load or allocation failure
 */
static void
ucnv_MBCSLoad(UConverterSharedData *sharedData,
          UConverterLoadArgs *pArgs,
          const uint8_t *raw,
          UErrorCode *pErrorCode) {
    UDataInfo info;
    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
    _MBCSHeader *header=(_MBCSHeader *)raw;
    uint32_t offset;
    uint32_t headerLength;
    UBool noFromU=FALSE;

    /* headerLength is counted in 32-bit units (it is multiplied by 4 below) */
    if(header->version[0]==4) {
        headerLength=MBCS_HEADER_V4_LENGTH;
    } else if(header->version[0]==5 && header->version[1]>=3 &&
              (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
        headerLength=header->options&MBCS_OPT_LENGTH_MASK;
        noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
    } else {
        *pErrorCode=U_INVALID_TABLE_FORMAT;
        return;
    }

    /* the low byte of the flags is the output type */
    mbcsTable->outputType=(uint8_t)header->flags;
    if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
        /* omitted fromUnicode data is not accepted for MBCS_OUTPUT_1 tables */
        *pErrorCode=U_INVALID_TABLE_FORMAT;
        return;
    }

    /* extension data, header version 4.2 and higher */
    offset=header->flags>>8;
    if(offset!=0) {
        mbcsTable->extIndexes=(const int32_t *)(raw+offset);
    }

    if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
        UConverterLoadArgs args={ 0 };
        UConverterSharedData *baseSharedData;
        const int32_t *extIndexes;
        const char *baseName;

        /* extension-only file, load the base table and set values appropriately */
        if((extIndexes=mbcsTable->extIndexes)==NULL) {
            /* extension-only file without extension */
            *pErrorCode=U_INVALID_TABLE_FORMAT;
            return;
        }

        if(pArgs->nestedLoads!=1) {
            /* an extension table must not be loaded as a base table */
            *pErrorCode=U_INVALID_TABLE_FILE;
            return;
        }

        /* load the base table; its name is stored directly after the header */
        baseName=(const char *)header+headerLength*4;
        if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
            /* forbid loading this same extension-only file */
            *pErrorCode=U_INVALID_TABLE_FORMAT;
            return;
        }

        /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
        args.size=sizeof(UConverterLoadArgs);
        args.nestedLoads=2;
        args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
        args.reserved=pArgs->reserved;
        args.options=pArgs->options;
        args.pkg=pArgs->pkg;
        args.name=baseName;
        baseSharedData=ucnv_load(&args, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return;
        }
        /* the base must be an MBCS table that is not itself extension-only */
        if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
            baseSharedData->mbcs.baseSharedData!=NULL
        ) {
            ucnv_unload(baseSharedData);
            *pErrorCode=U_INVALID_TABLE_FORMAT;
            return;
        }
        if(pArgs->onlyTestIsLoadable) {
            /*
             * Exit as soon as we know that we can load the converter
             * and the format is valid and supported.
             * The worst that can happen in the following code is a memory
             * allocation error.
             */
            ucnv_unload(baseSharedData);
            return;
        }

        /* copy the base table data */
        uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));

        /* overwrite values with relevant ones for the extension converter */
        mbcsTable->baseSharedData=baseSharedData;
        mbcsTable->extIndexes=extIndexes;

        /*
         * It would be possible to share the swapLFNL data with a base converter,
         * but the generated name would have to be different, and the memory
         * would have to be free'd only once.
         * It is easier to just create the data for the extension converter
         * separately when it is requested.
         */
        mbcsTable->swapLFNLStateTable=NULL;
        mbcsTable->swapLFNLFromUnicodeBytes=NULL;
        mbcsTable->swapLFNLName=NULL;

        /*
         * The reconstitutedData must be deleted only when the base converter
         * is unloaded.
         */
        mbcsTable->reconstitutedData=NULL;

        /*
         * Set a special, runtime-only outputType if the extension converter
         * is a DBCS version of a base converter that also maps single bytes.
         */
        if( sharedData->staticData->conversionType==UCNV_DBCS ||
                (sharedData->staticData->conversionType==UCNV_MBCS &&
                 sharedData->staticData->minBytesPerChar>=2)
        ) {
            if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
                /* the base converter is SI/SO-stateful */
                int32_t entry;

                /* get the dbcs state from the state table entry for SO=0x0e */
                entry=mbcsTable->stateTable[0][0xe];
                if( MBCS_ENTRY_IS_FINAL(entry) &&
                    MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
                    MBCS_ENTRY_FINAL_STATE(entry)!=0
                ) {
                    mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);

                    mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
                }
            } else if(
                baseSharedData->staticData->conversionType==UCNV_MBCS &&
                baseSharedData->staticData->minBytesPerChar==1 &&
                baseSharedData->staticData->maxBytesPerChar==2 &&
                mbcsTable->countStates<=127
            ) {
                /* non-stateful base converter, need to modify the state table */
                int32_t (*newStateTable)[256];
                int32_t *state;
                int32_t i, count;

                /* allocate a new state table (one extra state) and copy the base state table contents */
                count=mbcsTable->countStates;
                newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
                if(newStateTable==NULL) {
                    ucnv_unload(baseSharedData);
                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                    return;
                }

                uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);

                /* change all final single-byte entries to go to a new all-illegal state */
                state=newStateTable[0];
                for(i=0; i<256; ++i) {
                    if(MBCS_ENTRY_IS_FINAL(state[i])) {
                        state[i]=MBCS_ENTRY_TRANSITION(count, 0);
                    }
                }

                /* build the new all-illegal state */
                state=newStateTable[count];
                for(i=0; i<256; ++i) {
                    state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
                }
                mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
                mbcsTable->countStates=(uint8_t)(count+1);
                /* ucnv_MBCSUnload() frees the state table when this flag is set */
                mbcsTable->stateTableOwned=TRUE;

                mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
            }
        }

        /*
         * unlike below for files with base tables, do not get the unicodeMask
         * from the sharedData; instead, use the base table's unicodeMask,
         * which we copied in the memcpy above;
         * this is necessary because the static data unicodeMask, especially
         * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
         */
    } else {
        /* conversion file with a base table; an additional extension table is optional */
        /* make sure that the output type is known */
        switch(mbcsTable->outputType) {
        case MBCS_OUTPUT_1:
        case MBCS_OUTPUT_2:
        case MBCS_OUTPUT_3:
        case MBCS_OUTPUT_4:
        case MBCS_OUTPUT_3_EUC:
        case MBCS_OUTPUT_4_EUC:
        case MBCS_OUTPUT_2_SISO:
            /* OK */
            break;
        default:
            *pErrorCode=U_INVALID_TABLE_FORMAT;
            return;
        }
        if(pArgs->onlyTestIsLoadable) {
            /*
             * Exit as soon as we know that we can load the converter
             * and the format is valid and supported.
             * The worst that can happen in the following code is a memory
             * allocation error.
             */
            return;
        }

        /* the state table follows the header, the toUnicode fallbacks follow the state table */
        mbcsTable->countStates=(uint8_t)header->countStates;
        mbcsTable->countToUFallbacks=header->countToUFallbacks;
        mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
        mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
        mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);

        mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
        mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
        mbcsTable->fromUBytesLength=header->fromUBytesLength;

        /*
         * converter versions 6.1 and up contain a unicodeMask that is
         * used here to select the most efficient function implementations
         */
        info.size=sizeof(UDataInfo);
        udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
        if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
            /* mask off possible future extensions to be safe */
            mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
        } else {
            /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
            mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
        }

        /*
         * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
         * Check for the header version, SBCS vs. MBCS, and for whether the
         * data structures are optimized for code points as high as what the
         * runtime code is designed for.
         * The implementation does not handle mapping tables with entries for
         * unpaired surrogates.
         */
        if( header->version[1]>=3 &&
            (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
            (mbcsTable->countStates==1 ?
                (header->version[2]>=(SBCS_FAST_MAX>>8)) :
                (header->version[2]>=(MBCS_FAST_MAX>>8))
            )
        ) {
            mbcsTable->utf8Friendly=TRUE;

            if(mbcsTable->countStates==1) {
                /*
                 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
                 * Build a table with indexes to each block, to be used instead of
                 * the regular stage 1/2 table.
                 */
                int32_t i;
                for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
                    mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
                }
                /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
                mbcsTable->maxFastUChar=SBCS_FAST_MAX;
            } else {
                /*
                 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
                 * The .cnv file is prebuilt with an additional stage table with indexes
                 * to each block.
                 * The index follows the fromUnicode bytes; when those bytes are
                 * omitted from the file (noFromU) it starts at offsetFromUBytes itself.
                 */
                mbcsTable->mbcsIndex=(const uint16_t *)
                    (mbcsTable->fromUnicodeBytes+
                     (noFromU ? 0 : mbcsTable->fromUBytesLength));
                mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
            }
        }

        /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
        {
            uint32_t asciiRoundtrips=0xffffffff;
            int32_t i;

            for(i=0; i<0x80; ++i) {
                /* clear the bit for a group of 4 if any byte in it does not map directly to itself */
                if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
                    asciiRoundtrips&=~((uint32_t)1<<(i>>2));
                }
            }
            mbcsTable->asciiRoundtrips=asciiRoundtrips;
        }

        if(noFromU) {
            /* lengths are numbers of units (16-bit for stage 1, 32-bit for stage 2), not bytes */
            uint32_t stage1Length=
                mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
                    0x440 : 0x40;
            uint32_t stage2Length=
                (header->offsetFromUBytes-header->offsetFromUTable)/4-
                stage1Length/2;
            reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
        }
    }

    /* Set the impl pointer here so that it is set for both extension-only and base tables. */
    if(mbcsTable->utf8Friendly) {
        if(mbcsTable->countStates==1) {
            sharedData->impl=&_SBCSUTF8Impl;
        } else {
            if(mbcsTable->outputType==MBCS_OUTPUT_2) {
                sharedData->impl=&_DBCSUTF8Impl;
            }
        }
    }

    if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
        /*
         * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
         * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
         */
        mbcsTable->asciiRoundtrips=0;
    }
}

/*
 * Release everything that the table owns beyond the loaded data itself:
 * the swapLFNL block (a single allocation that starts at swapLFNLStateTable,
 * see _EBCDICSwapLFNL()), a state table built for a DBCS-only extension
 * converter, the reference to the base converter, and reconstituted
 * fromUnicode data.
 */
static void
ucnv_MBCSUnload(UConverterSharedData *sharedData) {
    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;

    if(mbcsTable->swapLFNLStateTable!=NULL) {
        uprv_free(mbcsTable->swapLFNLStateTable);
    }
    if(mbcsTable->stateTableOwned) {
        uprv_free((void *)mbcsTable->stateTable);
    }
    if(mbcsTable->baseSharedData!=NULL) {
        ucnv_unload(mbcsTable->baseSharedData);
    }
    if(mbcsTable->reconstitutedData!=NULL) {
        uprv_free(mbcsTable->reconstitutedData);
    }
}

/*
 * Per-converter setup after the shared table has been loaded:
 * applies or removes the swaplfnl option, sets name-based option flags
 * (GB 18030, KEIS, JEF, JIPS) and adjusts cnv->maxBytesPerUChar.
 * Does nothing if pArgs->onlyTestIsLoadable is set.
 */
static void
ucnv_MBCSOpen(UConverter *cnv,
              UConverterLoadArgs *pArgs,
              UErrorCode *pErrorCode) {
    UConverterMBCSTable *mbcsTable;
    const int32_t *extIndexes;
    uint8_t outputType;
    int8_t maxBytesPerUChar;

    if(pArgs->onlyTestIsLoadable) {
        return;
    }

    mbcsTable=&cnv->sharedData->mbcs;
    outputType=mbcsTable->outputType;

    if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
        /* the swaplfnl option does not apply, remove it */
        cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
}

    if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        /* do this because double-checked locking is broken */
        UBool isCached;

        /* read the pointer under the mutex instead of testing it unlocked */
        umtx_lock(NULL);
        isCached=mbcsTable->swapLFNLStateTable!=NULL;
        umtx_unlock(NULL);

        if(!isCached) {
            if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
                if(U_FAILURE(*pErrorCode)) {
                    return; /* something went wrong */
                }

                /* the option does not apply, remove it */
                cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
            }
        }
    }

    /* the following flags are derived from substrings of the requested converter name */
    if(uprv_strstr(pArgs->name, "18030")!=NULL) {
        if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
            /* set a flag for GB 18030 mode, which changes the callback behavior */
            cnv->options|=_MBCS_OPTION_GB18030;
        }
    } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
        /* set a flag for KEIS converter, which changes the SI/SO character sequence */
        cnv->options|=_MBCS_OPTION_KEIS;
    } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
        /* set a flag for JEF converter, which changes the SI/SO character sequence */
        cnv->options|=_MBCS_OPTION_JEF;
    } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
        /* set a flag for JIPS converter, which changes the SI/SO character sequence */
        cnv->options|=_MBCS_OPTION_JIPS;
    }

    /* fix maxBytesPerUChar depending on outputType and options etc. */
    if(outputType==MBCS_OUTPUT_2_SISO) {
        cnv->maxBytesPerUChar=3; /* SO+DBCS */
    }

    /* an extension table may produce longer output per UChar than the base table */
    extIndexes=mbcsTable->extIndexes;
    if(extIndexes!=NULL) {
        maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
        if(outputType==MBCS_OUTPUT_2_SISO) {
            ++maxBytesPerUChar;  /* SO + multiple DBCS */
        }

        if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
            cnv->maxBytesPerUChar=maxBytesPerUChar;
        }
    }

#if 0
    /*
     * documentation of UConverter fields used for status
     * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
     */

    /* toUnicode */
    cnv->toUnicodeStatus=0;     /* offset */
    cnv->mode=0;                /* state */
    cnv->toULength=0;           /* byteIndex */

    /* fromUnicode */
    cnv->fromUChar32=0;
    cnv->fromUnicodeStatus=1;   /* prevLength */
#endif
}

/*
 * @return the name with the swap option appended if the converter uses
 *         the swaplfnl option and the swapped name has been built,
 *         otherwise the name from the static data
 */
static const char *
ucnv_MBCSGetName(const UConverter *cnv) {
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
        return cnv->sharedData->mbcs.swapLFNLName;
    } else {
        return cnv->sharedData->staticData->name;
    }
}

/* MBCS-to-Unicode conversion functions ------------------------------------- */

/*
 * Look up the toUnicode fallback mapping for a result offset.
 * The lookup is a binary search over mbcsTable->toUFallbacks[0..countToUFallbacks[
 * (the search presumes that the entries are sorted by ascending offset).
 *
 * @return the fallback code point, or 0xfffe if there is none for this offset
 */
static UChar32
ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
    const _MBCSToUFallback *toUFallbacks;
    uint32_t i, start, limit;

    limit=mbcsTable->countToUFallbacks;
    if(limit>0) {
        /* do a binary search for the fallback mapping */
        toUFallbacks=mbcsTable->toUFallbacks;
        start=0;
        /* narrow [start, limit[ down to a single candidate entry */
        while(start<limit-1) {
            i=(start+limit)/2;
            if(offset<toUFallbacks[i].offset) {
                limit=i;
            } else {
                start=i;
            }
        }

        /* did we really find it? */
        if(offset==toUFallbacks[start].offset) {
            return toUFallbacks[start].codePoint;
        }
    }

    return 0xfffe;
}

/* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
static void
ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                                UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    const int32_t (*stateTable)[256];

    int32_t sourceIndex;

    int32_t entry;
    UChar c;
    uint8_t action;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;

    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=0;

    /* conversion loop */
    while(source<sourceLimit) {
        /*
         * This following test is to see if available input would overflow the output.
         * It does not catch output of more than one code unit that
         * overflows as a result of a surrogate pair or callback output
         * from the last source byte.
         * Therefore, those situations also test for overflows and will
         * then break the loop, too.
1926 */ 1927 if(target>=targetLimit) { 1928 /* target is full */ 1929 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1930 break; 1931 } 1932 1933 entry=stateTable[0][*source++]; 1934 /* MBCS_ENTRY_IS_FINAL(entry) */ 1935 1936 /* test the most common case first */ 1937 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1938 /* output BMP code point */ 1939 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1940 if(offsets!=NULL) { 1941 *offsets++=sourceIndex; 1942 } 1943 1944 /* normal end of action codes: prepare for a new character */ 1945 ++sourceIndex; 1946 continue; 1947 } 1948 1949 /* 1950 * An if-else-if chain provides more reliable performance for 1951 * the most common cases compared to a switch. 1952 */ 1953 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1954 if(action==MBCS_STATE_VALID_DIRECT_20 || 1955 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 1956 ) { 1957 entry=MBCS_ENTRY_FINAL_VALUE(entry); 1958 /* output surrogate pair */ 1959 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 1960 if(offsets!=NULL) { 1961 *offsets++=sourceIndex; 1962 } 1963 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 1964 if(target<targetLimit) { 1965 *target++=c; 1966 if(offsets!=NULL) { 1967 *offsets++=sourceIndex; 1968 } 1969 } else { 1970 /* target overflow */ 1971 cnv->UCharErrorBuffer[0]=c; 1972 cnv->UCharErrorBufferLength=1; 1973 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1974 break; 1975 } 1976 1977 ++sourceIndex; 1978 continue; 1979 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1980 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 1981 /* output BMP code point */ 1982 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1983 if(offsets!=NULL) { 1984 *offsets++=sourceIndex; 1985 } 1986 1987 ++sourceIndex; 1988 continue; 1989 } 1990 } else if(action==MBCS_STATE_UNASSIGNED) { 1991 /* just fall through */ 1992 } else if(action==MBCS_STATE_ILLEGAL) { 1993 /* callback(illegal) */ 1994 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1995 } else { 1996 /* reserved, must never occur */ 1997 
++sourceIndex; 1998 continue; 1999 } 2000 2001 if(U_FAILURE(*pErrorCode)) { 2002 /* callback(illegal) */ 2003 break; 2004 } else /* unassigned sequences indicated with byteIndex>0 */ { 2005 /* try an extension mapping */ 2006 pArgs->source=(const char *)source; 2007 cnv->toUBytes[0]=*(source-1); 2008 cnv->toULength=_extToU(cnv, cnv->sharedData, 2009 1, &source, sourceLimit, 2010 &target, targetLimit, 2011 &offsets, sourceIndex, 2012 pArgs->flush, 2013 pErrorCode); 2014 sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source); 2015 2016 if(U_FAILURE(*pErrorCode)) { 2017 /* not mappable or buffer overflow */ 2018 break; 2019 } 2020 } 2021 } 2022 2023 /* write back the updated pointers */ 2024 pArgs->source=(const char *)source; 2025 pArgs->target=target; 2026 pArgs->offsets=offsets; 2027} 2028 2029/* 2030 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages 2031 * that only map to and from the BMP. 2032 * In addition to single-byte optimizations, the offset calculations 2033 * become much easier. 
2034 */ 2035static void 2036ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs, 2037 UErrorCode *pErrorCode) { 2038 UConverter *cnv; 2039 const uint8_t *source, *sourceLimit, *lastSource; 2040 UChar *target; 2041 int32_t targetCapacity, length; 2042 int32_t *offsets; 2043 2044 const int32_t (*stateTable)[256]; 2045 2046 int32_t sourceIndex; 2047 2048 int32_t entry; 2049 uint8_t action; 2050 2051 /* set up the local pointers */ 2052 cnv=pArgs->converter; 2053 source=(const uint8_t *)pArgs->source; 2054 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2055 target=pArgs->target; 2056 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 2057 offsets=pArgs->offsets; 2058 2059 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2060 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2061 } else { 2062 stateTable=cnv->sharedData->mbcs.stateTable; 2063 } 2064 2065 /* sourceIndex=-1 if the current character began in the previous buffer */ 2066 sourceIndex=0; 2067 lastSource=source; 2068 2069 /* 2070 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 2071 * for the minimum of the sourceLength and targetCapacity 2072 */ 2073 length=(int32_t)(sourceLimit-source); 2074 if(length<targetCapacity) { 2075 targetCapacity=length; 2076 } 2077 2078#if MBCS_UNROLL_SINGLE_TO_BMP 2079 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2080 /* unroll the loop with the most common case */ 2081unrolled: 2082 if(targetCapacity>=16) { 2083 int32_t count, loops, oredEntries; 2084 2085 loops=count=targetCapacity>>4; 2086 do { 2087 oredEntries=entry=stateTable[0][*source++]; 2088 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2089 oredEntries|=entry=stateTable[0][*source++]; 2090 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2091 oredEntries|=entry=stateTable[0][*source++]; 2092 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2093 oredEntries|=entry=stateTable[0][*source++]; 2094 
*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2095 oredEntries|=entry=stateTable[0][*source++]; 2096 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2097 oredEntries|=entry=stateTable[0][*source++]; 2098 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2099 oredEntries|=entry=stateTable[0][*source++]; 2100 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2101 oredEntries|=entry=stateTable[0][*source++]; 2102 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2103 oredEntries|=entry=stateTable[0][*source++]; 2104 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2105 oredEntries|=entry=stateTable[0][*source++]; 2106 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2107 oredEntries|=entry=stateTable[0][*source++]; 2108 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2109 oredEntries|=entry=stateTable[0][*source++]; 2110 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2111 oredEntries|=entry=stateTable[0][*source++]; 2112 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2113 oredEntries|=entry=stateTable[0][*source++]; 2114 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2115 oredEntries|=entry=stateTable[0][*source++]; 2116 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2117 oredEntries|=entry=stateTable[0][*source++]; 2118 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2119 2120 /* were all 16 entries really valid? 
*/ 2121 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) { 2122 /* no, return to the first of these 16 */ 2123 source-=16; 2124 target-=16; 2125 break; 2126 } 2127 } while(--count>0); 2128 count=loops-count; 2129 targetCapacity-=16*count; 2130 2131 if(offsets!=NULL) { 2132 lastSource+=16*count; 2133 while(count>0) { 2134 *offsets++=sourceIndex++; 2135 *offsets++=sourceIndex++; 2136 *offsets++=sourceIndex++; 2137 *offsets++=sourceIndex++; 2138 *offsets++=sourceIndex++; 2139 *offsets++=sourceIndex++; 2140 *offsets++=sourceIndex++; 2141 *offsets++=sourceIndex++; 2142 *offsets++=sourceIndex++; 2143 *offsets++=sourceIndex++; 2144 *offsets++=sourceIndex++; 2145 *offsets++=sourceIndex++; 2146 *offsets++=sourceIndex++; 2147 *offsets++=sourceIndex++; 2148 *offsets++=sourceIndex++; 2149 *offsets++=sourceIndex++; 2150 --count; 2151 } 2152 } 2153 } 2154#endif 2155 2156 /* conversion loop */ 2157 while(targetCapacity > 0 && source < sourceLimit) { 2158 entry=stateTable[0][*source++]; 2159 /* MBCS_ENTRY_IS_FINAL(entry) */ 2160 2161 /* test the most common case first */ 2162 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2163 /* output BMP code point */ 2164 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2165 --targetCapacity; 2166 continue; 2167 } 2168 2169 /* 2170 * An if-else-if chain provides more reliable performance for 2171 * the most common cases compared to a switch. 
2172 */ 2173 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2174 if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2175 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2176 /* output BMP code point */ 2177 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2178 --targetCapacity; 2179 continue; 2180 } 2181 } else if(action==MBCS_STATE_UNASSIGNED) { 2182 /* just fall through */ 2183 } else if(action==MBCS_STATE_ILLEGAL) { 2184 /* callback(illegal) */ 2185 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2186 } else { 2187 /* reserved, must never occur */ 2188 continue; 2189 } 2190 2191 /* set offsets since the start or the last extension */ 2192 if(offsets!=NULL) { 2193 int32_t count=(int32_t)(source-lastSource); 2194 2195 /* predecrement: do not set the offset for the callback-causing character */ 2196 while(--count>0) { 2197 *offsets++=sourceIndex++; 2198 } 2199 /* offset and sourceIndex are now set for the current character */ 2200 } 2201 2202 if(U_FAILURE(*pErrorCode)) { 2203 /* callback(illegal) */ 2204 break; 2205 } else /* unassigned sequences indicated with byteIndex>0 */ { 2206 /* try an extension mapping */ 2207 lastSource=source; 2208 cnv->toUBytes[0]=*(source-1); 2209 cnv->toULength=_extToU(cnv, cnv->sharedData, 2210 1, &source, sourceLimit, 2211 &target, pArgs->targetLimit, 2212 &offsets, sourceIndex, 2213 pArgs->flush, 2214 pErrorCode); 2215 sourceIndex+=1+(int32_t)(source-lastSource); 2216 2217 if(U_FAILURE(*pErrorCode)) { 2218 /* not mappable or buffer overflow */ 2219 break; 2220 } 2221 2222 /* recalculate the targetCapacity after an extension mapping */ 2223 targetCapacity=(int32_t)(pArgs->targetLimit-target); 2224 length=(int32_t)(sourceLimit-source); 2225 if(length<targetCapacity) { 2226 targetCapacity=length; 2227 } 2228 } 2229 2230#if MBCS_UNROLL_SINGLE_TO_BMP 2231 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2232 goto unrolled; 2233#endif 2234 } 2235 2236 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) { 2237 /* target is full */ 
2238 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2239 } 2240 2241 /* set offsets since the start or the last callback */ 2242 if(offsets!=NULL) { 2243 size_t count=source-lastSource; 2244 while(count>0) { 2245 *offsets++=sourceIndex++; 2246 --count; 2247 } 2248 } 2249 2250 /* write back the updated pointers */ 2251 pArgs->source=(const char *)source; 2252 pArgs->target=target; 2253 pArgs->offsets=offsets; 2254} 2255 2256static UBool 2257hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { 2258 const int32_t *row=stateTable[state]; 2259 int32_t b, entry; 2260 /* First test for final entries in this state for some commonly valid byte values. */ 2261 entry=row[0xa1]; 2262 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2263 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2264 ) { 2265 return TRUE; 2266 } 2267 entry=row[0x41]; 2268 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2269 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2270 ) { 2271 return TRUE; 2272 } 2273 /* Then test for final entries in this state. */ 2274 for(b=0; b<=0xff; ++b) { 2275 entry=row[b]; 2276 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2277 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2278 ) { 2279 return TRUE; 2280 } 2281 } 2282 /* Then recurse for transition entries. */ 2283 for(b=0; b<=0xff; ++b) { 2284 entry=row[b]; 2285 if( MBCS_ENTRY_IS_TRANSITION(entry) && 2286 hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) 2287 ) { 2288 return TRUE; 2289 } 2290 } 2291 return FALSE; 2292} 2293 2294/* 2295 * Is byte b a single/lead byte in this state? 2296 * Recurse for transition states, because here we don't want to say that 2297 * b is a lead byte if all byte sequences that start with b are illegal. 
2298 */ 2299static UBool 2300isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { 2301 const int32_t *row=stateTable[state]; 2302 int32_t entry=row[b]; 2303 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ 2304 return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); 2305 } else { 2306 uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2307 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { 2308 return FALSE; /* SI/SO are illegal for DBCS-only conversion */ 2309 } else { 2310 return action!=MBCS_STATE_ILLEGAL; 2311 } 2312 } 2313} 2314 2315U_CFUNC void 2316ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2317 UErrorCode *pErrorCode) { 2318 UConverter *cnv; 2319 const uint8_t *source, *sourceLimit; 2320 UChar *target; 2321 const UChar *targetLimit; 2322 int32_t *offsets; 2323 2324 const int32_t (*stateTable)[256]; 2325 const uint16_t *unicodeCodeUnits; 2326 2327 uint32_t offset; 2328 uint8_t state; 2329 int8_t byteIndex; 2330 uint8_t *bytes; 2331 2332 int32_t sourceIndex, nextSourceIndex; 2333 2334 int32_t entry; 2335 UChar c; 2336 uint8_t action; 2337 2338 /* use optimized function if possible */ 2339 cnv=pArgs->converter; 2340 2341 if(cnv->preToULength>0) { 2342 /* 2343 * pass sourceIndex=-1 because we continue from an earlier buffer 2344 * in the future, this may change with continuous offsets 2345 */ 2346 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode); 2347 2348 if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) { 2349 return; 2350 } 2351 } 2352 2353 if(cnv->sharedData->mbcs.countStates==1) { 2354 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 2355 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode); 2356 } else { 2357 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode); 2358 } 2359 return; 2360 } 2361 2362 /* set up the local pointers */ 2363 source=(const uint8_t *)pArgs->source; 2364 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2365 
target=pArgs->target; 2366 targetLimit=pArgs->targetLimit; 2367 offsets=pArgs->offsets; 2368 2369 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2370 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2371 } else { 2372 stateTable=cnv->sharedData->mbcs.stateTable; 2373 } 2374 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2375 2376 /* get the converter state from UConverter */ 2377 offset=cnv->toUnicodeStatus; 2378 byteIndex=cnv->toULength; 2379 bytes=cnv->toUBytes; 2380 2381 /* 2382 * if we are in the SBCS state for a DBCS-only converter, 2383 * then load the DBCS state from the MBCS data 2384 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2385 */ 2386 if((state=(uint8_t)(cnv->mode))==0) { 2387 state=cnv->sharedData->mbcs.dbcsOnlyState; 2388 } 2389 2390 /* sourceIndex=-1 if the current character began in the previous buffer */ 2391 sourceIndex=byteIndex==0 ? 0 : -1; 2392 nextSourceIndex=0; 2393 2394 /* conversion loop */ 2395 while(source<sourceLimit) { 2396 /* 2397 * This following test is to see if available input would overflow the output. 2398 * It does not catch output of more than one code unit that 2399 * overflows as a result of a surrogate pair or callback output 2400 * from the last source byte. 2401 * Therefore, those situations also test for overflows and will 2402 * then break the loop, too. 
2403 */ 2404 if(target>=targetLimit) { 2405 /* target is full */ 2406 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2407 break; 2408 } 2409 2410 if(byteIndex==0) { 2411 /* optimized loop for 1/2-byte input and BMP output */ 2412 if(offsets==NULL) { 2413 do { 2414 entry=stateTable[state][*source]; 2415 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2416 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2417 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2418 2419 ++source; 2420 if( source<sourceLimit && 2421 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2422 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2423 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2424 ) { 2425 ++source; 2426 *target++=c; 2427 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2428 offset=0; 2429 } else { 2430 /* set the state and leave the optimized loop */ 2431 bytes[0]=*(source-1); 2432 byteIndex=1; 2433 break; 2434 } 2435 } else { 2436 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2437 /* output BMP code point */ 2438 ++source; 2439 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2440 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2441 } else { 2442 /* leave the optimized loop */ 2443 break; 2444 } 2445 } 2446 } while(source<sourceLimit && target<targetLimit); 2447 } else /* offsets!=NULL */ { 2448 do { 2449 entry=stateTable[state][*source]; 2450 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2451 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2452 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2453 2454 ++source; 2455 if( source<sourceLimit && 2456 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2457 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2458 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2459 ) { 2460 ++source; 2461 *target++=c; 2462 if(offsets!=NULL) { 2463 *offsets++=sourceIndex; 2464 sourceIndex=(nextSourceIndex+=2); 2465 } 2466 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 
0 */ 2467 offset=0; 2468 } else { 2469 /* set the state and leave the optimized loop */ 2470 ++nextSourceIndex; 2471 bytes[0]=*(source-1); 2472 byteIndex=1; 2473 break; 2474 } 2475 } else { 2476 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2477 /* output BMP code point */ 2478 ++source; 2479 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2480 if(offsets!=NULL) { 2481 *offsets++=sourceIndex; 2482 sourceIndex=++nextSourceIndex; 2483 } 2484 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2485 } else { 2486 /* leave the optimized loop */ 2487 break; 2488 } 2489 } 2490 } while(source<sourceLimit && target<targetLimit); 2491 } 2492 2493 /* 2494 * these tests and break statements could be put inside the loop 2495 * if C had "break outerLoop" like Java 2496 */ 2497 if(source>=sourceLimit) { 2498 break; 2499 } 2500 if(target>=targetLimit) { 2501 /* target is full */ 2502 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2503 break; 2504 } 2505 2506 ++nextSourceIndex; 2507 bytes[byteIndex++]=*source++; 2508 } else /* byteIndex>0 */ { 2509 ++nextSourceIndex; 2510 entry=stateTable[state][bytes[byteIndex++]=*source++]; 2511 } 2512 2513 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2514 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2515 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2516 continue; 2517 } 2518 2519 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2520 cnv->mode=state; 2521 2522 /* set the next state early so that we can reuse the entry variable */ 2523 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2524 2525 /* 2526 * An if-else-if chain provides more reliable performance for 2527 * the most common cases compared to a switch. 
2528 */ 2529 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2530 if(action==MBCS_STATE_VALID_16) { 2531 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2532 c=unicodeCodeUnits[offset]; 2533 if(c<0xfffe) { 2534 /* output BMP code point */ 2535 *target++=c; 2536 if(offsets!=NULL) { 2537 *offsets++=sourceIndex; 2538 } 2539 byteIndex=0; 2540 } else if(c==0xfffe) { 2541 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2542 /* output fallback BMP code point */ 2543 *target++=(UChar)entry; 2544 if(offsets!=NULL) { 2545 *offsets++=sourceIndex; 2546 } 2547 byteIndex=0; 2548 } 2549 } else { 2550 /* callback(illegal) */ 2551 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2552 } 2553 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 2554 /* output BMP code point */ 2555 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2556 if(offsets!=NULL) { 2557 *offsets++=sourceIndex; 2558 } 2559 byteIndex=0; 2560 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2561 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2562 c=unicodeCodeUnits[offset++]; 2563 if(c<0xd800) { 2564 /* output BMP code point below 0xd800 */ 2565 *target++=c; 2566 if(offsets!=NULL) { 2567 *offsets++=sourceIndex; 2568 } 2569 byteIndex=0; 2570 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 2571 /* output roundtrip or fallback surrogate pair */ 2572 *target++=(UChar)(c&0xdbff); 2573 if(offsets!=NULL) { 2574 *offsets++=sourceIndex; 2575 } 2576 byteIndex=0; 2577 if(target<targetLimit) { 2578 *target++=unicodeCodeUnits[offset]; 2579 if(offsets!=NULL) { 2580 *offsets++=sourceIndex; 2581 } 2582 } else { 2583 /* target overflow */ 2584 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset]; 2585 cnv->UCharErrorBufferLength=1; 2586 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2587 2588 offset=0; 2589 break; 2590 } 2591 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { 2592 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2593 *target++=unicodeCodeUnits[offset]; 2594 if(offsets!=NULL) { 2595 *offsets++=sourceIndex; 2596 } 2597 byteIndex=0; 2598 } else if(c==0xffff) { 2599 /* callback(illegal) */ 2600 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2601 } 2602 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2603 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2604 ) { 2605 entry=MBCS_ENTRY_FINAL_VALUE(entry); 2606 /* output surrogate pair */ 2607 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 2608 if(offsets!=NULL) { 2609 *offsets++=sourceIndex; 2610 } 2611 byteIndex=0; 2612 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 2613 if(target<targetLimit) { 2614 *target++=c; 2615 if(offsets!=NULL) { 2616 *offsets++=sourceIndex; 2617 } 2618 } else { 2619 /* target overflow */ 2620 cnv->UCharErrorBuffer[0]=c; 2621 cnv->UCharErrorBufferLength=1; 2622 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2623 2624 offset=0; 2625 break; 2626 } 2627 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2628 /* 2629 * This serves as a state change without any output. 2630 * It is useful for reading simple stateful encodings, 2631 * for example using just Shift-In/Shift-Out codes. 2632 * The 21 unused bits may later be used for more sophisticated 2633 * state transitions. 
2634 */ 2635 if(cnv->sharedData->mbcs.dbcsOnlyState==0) { 2636 byteIndex=0; 2637 } else { 2638 /* SI/SO are illegal for DBCS-only conversion */ 2639 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2640 2641 /* callback(illegal) */ 2642 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2643 } 2644 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2645 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2646 /* output BMP code point */ 2647 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2648 if(offsets!=NULL) { 2649 *offsets++=sourceIndex; 2650 } 2651 byteIndex=0; 2652 } 2653 } else if(action==MBCS_STATE_UNASSIGNED) { 2654 /* just fall through */ 2655 } else if(action==MBCS_STATE_ILLEGAL) { 2656 /* callback(illegal) */ 2657 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2658 } else { 2659 /* reserved, must never occur */ 2660 byteIndex=0; 2661 } 2662 2663 /* end of action codes: prepare for a new character */ 2664 offset=0; 2665 2666 if(byteIndex==0) { 2667 sourceIndex=nextSourceIndex; 2668 } else if(U_FAILURE(*pErrorCode)) { 2669 /* callback(illegal) */ 2670 if(byteIndex>1) { 2671 /* 2672 * Ticket 5691: consistent illegal sequences: 2673 * - We include at least the first byte in the illegal sequence. 2674 * - If any of the non-initial bytes could be the start of a character, 2675 * we stop the illegal sequence before the first one of those. 2676 */ 2677 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 2678 int8_t i; 2679 for(i=1; 2680 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]); 2681 ++i) {} 2682 if(i<byteIndex) { 2683 /* Back out some bytes. */ 2684 int8_t backOutDistance=byteIndex-i; 2685 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source); 2686 byteIndex=i; /* length of reported illegal byte sequence */ 2687 if(backOutDistance<=bytesFromThisBuffer) { 2688 source-=backOutDistance; 2689 } else { 2690 /* Back out bytes from the previous buffer: Need to replay them. 
*/ 2691 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 2692 /* preToULength is negative! */ 2693 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); 2694 source=(const uint8_t *)pArgs->source; 2695 } 2696 } 2697 } 2698 break; 2699 } else /* unassigned sequences indicated with byteIndex>0 */ { 2700 /* try an extension mapping */ 2701 pArgs->source=(const char *)source; 2702 byteIndex=_extToU(cnv, cnv->sharedData, 2703 byteIndex, &source, sourceLimit, 2704 &target, targetLimit, 2705 &offsets, sourceIndex, 2706 pArgs->flush, 2707 pErrorCode); 2708 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); 2709 2710 if(U_FAILURE(*pErrorCode)) { 2711 /* not mappable or buffer overflow */ 2712 break; 2713 } 2714 } 2715 } 2716 2717 /* set the converter state back into UConverter */ 2718 cnv->toUnicodeStatus=offset; 2719 cnv->mode=state; 2720 cnv->toULength=byteIndex; 2721 2722 /* write back the updated pointers */ 2723 pArgs->source=(const char *)source; 2724 pArgs->target=target; 2725 pArgs->offsets=offsets; 2726} 2727 2728/* 2729 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages. 2730 * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 
2731 */ 2732static UChar32 2733ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 2734 UErrorCode *pErrorCode) { 2735 UConverter *cnv; 2736 const int32_t (*stateTable)[256]; 2737 const uint8_t *source, *sourceLimit; 2738 2739 int32_t entry; 2740 uint8_t action; 2741 2742 /* set up the local pointers */ 2743 cnv=pArgs->converter; 2744 source=(const uint8_t *)pArgs->source; 2745 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2746 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2747 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2748 } else { 2749 stateTable=cnv->sharedData->mbcs.stateTable; 2750 } 2751 2752 /* conversion loop */ 2753 while(source<sourceLimit) { 2754 entry=stateTable[0][*source++]; 2755 /* MBCS_ENTRY_IS_FINAL(entry) */ 2756 2757 /* write back the updated pointer early so that we can return directly */ 2758 pArgs->source=(const char *)source; 2759 2760 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2761 /* output BMP code point */ 2762 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2763 } 2764 2765 /* 2766 * An if-else-if chain provides more reliable performance for 2767 * the most common cases compared to a switch. 
2768 */ 2769 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2770 if( action==MBCS_STATE_VALID_DIRECT_20 || 2771 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2772 ) { 2773 /* output supplementary code point */ 2774 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2775 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2776 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2777 /* output BMP code point */ 2778 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2779 } 2780 } else if(action==MBCS_STATE_UNASSIGNED) { 2781 /* just fall through */ 2782 } else if(action==MBCS_STATE_ILLEGAL) { 2783 /* callback(illegal) */ 2784 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2785 } else { 2786 /* reserved, must never occur */ 2787 continue; 2788 } 2789 2790 if(U_FAILURE(*pErrorCode)) { 2791 /* callback(illegal) */ 2792 break; 2793 } else /* unassigned sequence */ { 2794 /* defer to the generic implementation */ 2795 pArgs->source=(const char *)source-1; 2796 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2797 } 2798 } 2799 2800 /* no output because of empty input or only state changes */ 2801 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2802 return 0xffff; 2803} 2804 2805/* 2806 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 2807 * conversion without offset handling. 2808 * 2809 * When a character does not have a mapping to Unicode, then we return to the 2810 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 2811 * handling. 2812 * We also defer to the generic code in other complicated cases and have them 2813 * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 2814 * 2815 * All normal mappings and errors are handled here. 
2816 */ 2817static UChar32 2818ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 2819 UErrorCode *pErrorCode) { 2820 UConverter *cnv; 2821 const uint8_t *source, *sourceLimit, *lastSource; 2822 2823 const int32_t (*stateTable)[256]; 2824 const uint16_t *unicodeCodeUnits; 2825 2826 uint32_t offset; 2827 uint8_t state; 2828 2829 int32_t entry; 2830 UChar32 c; 2831 uint8_t action; 2832 2833 /* use optimized function if possible */ 2834 cnv=pArgs->converter; 2835 2836 if(cnv->preToULength>0) { 2837 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */ 2838 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2839 } 2840 2841 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { 2842 /* 2843 * Using the generic ucnv_getNextUChar() code lets us deal correctly 2844 * with the rare case of a codepage that maps single surrogates 2845 * without adding the complexity to this already complicated function here. 2846 */ 2847 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2848 } else if(cnv->sharedData->mbcs.countStates==1) { 2849 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode); 2850 } 2851 2852 /* set up the local pointers */ 2853 source=lastSource=(const uint8_t *)pArgs->source; 2854 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2855 2856 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2857 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2858 } else { 2859 stateTable=cnv->sharedData->mbcs.stateTable; 2860 } 2861 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2862 2863 /* get the converter state from UConverter */ 2864 offset=cnv->toUnicodeStatus; 2865 2866 /* 2867 * if we are in the SBCS state for a DBCS-only converter, 2868 * then load the DBCS state from the MBCS data 2869 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2870 */ 2871 if((state=(uint8_t)(cnv->mode))==0) { 2872 state=cnv->sharedData->mbcs.dbcsOnlyState; 2873 } 2874 2875 /* conversion loop */ 2876 c=U_SENTINEL; 2877 while(source<sourceLimit) { 
2878 entry=stateTable[state][*source++]; 2879 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2880 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2881 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2882 2883 /* optimization for 1/2-byte input and BMP output */ 2884 if( source<sourceLimit && 2885 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2886 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2887 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2888 ) { 2889 ++source; 2890 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2891 /* output BMP code point */ 2892 break; 2893 } 2894 } else { 2895 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2896 cnv->mode=state; 2897 2898 /* set the next state early so that we can reuse the entry variable */ 2899 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2900 2901 /* 2902 * An if-else-if chain provides more reliable performance for 2903 * the most common cases compared to a switch. 2904 */ 2905 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2906 if(action==MBCS_STATE_VALID_DIRECT_16) { 2907 /* output BMP code point */ 2908 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2909 break; 2910 } else if(action==MBCS_STATE_VALID_16) { 2911 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2912 c=unicodeCodeUnits[offset]; 2913 if(c<0xfffe) { 2914 /* output BMP code point */ 2915 break; 2916 } else if(c==0xfffe) { 2917 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2918 break; 2919 } 2920 } else { 2921 /* callback(illegal) */ 2922 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2923 } 2924 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2925 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2926 c=unicodeCodeUnits[offset++]; 2927 if(c<0xd800) { 2928 /* output BMP code point below 0xd800 */ 2929 break; 2930 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
c<=0xdfff : c<=0xdbff) { 2931 /* output roundtrip or fallback supplementary code point */ 2932 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 2933 break; 2934 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 2935 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2936 c=unicodeCodeUnits[offset]; 2937 break; 2938 } else if(c==0xffff) { 2939 /* callback(illegal) */ 2940 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2941 } 2942 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2943 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2944 ) { 2945 /* output supplementary code point */ 2946 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2947 break; 2948 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2949 /* 2950 * This serves as a state change without any output. 2951 * It is useful for reading simple stateful encodings, 2952 * for example using just Shift-In/Shift-Out codes. 2953 * The 21 unused bits may later be used for more sophisticated 2954 * state transitions. 
2955 */ 2956 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) { 2957 /* SI/SO are illegal for DBCS-only conversion */ 2958 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2959 2960 /* callback(illegal) */ 2961 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2962 } 2963 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2964 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2965 /* output BMP code point */ 2966 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2967 break; 2968 } 2969 } else if(action==MBCS_STATE_UNASSIGNED) { 2970 /* just fall through */ 2971 } else if(action==MBCS_STATE_ILLEGAL) { 2972 /* callback(illegal) */ 2973 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2974 } else { 2975 /* reserved (must never occur), or only state change */ 2976 offset=0; 2977 lastSource=source; 2978 continue; 2979 } 2980 2981 /* end of action codes: prepare for a new character */ 2982 offset=0; 2983 2984 if(U_FAILURE(*pErrorCode)) { 2985 /* callback(illegal) */ 2986 break; 2987 } else /* unassigned sequence */ { 2988 /* defer to the generic implementation */ 2989 cnv->toUnicodeStatus=0; 2990 cnv->mode=state; 2991 pArgs->source=(const char *)lastSource; 2992 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2993 } 2994 } 2995 } 2996 2997 if(c<0) { 2998 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { 2999 /* incomplete character byte sequence */ 3000 uint8_t *bytes=cnv->toUBytes; 3001 cnv->toULength=(int8_t)(source-lastSource); 3002 do { 3003 *bytes++=*lastSource++; 3004 } while(lastSource<source); 3005 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3006 } else if(U_FAILURE(*pErrorCode)) { 3007 /* callback(illegal) */ 3008 /* 3009 * Ticket 5691: consistent illegal sequences: 3010 * - We include at least the first byte in the illegal sequence. 3011 * - If any of the non-initial bytes could be the start of a character, 3012 * we stop the illegal sequence before the first one of those. 
3013 */ 3014 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 3015 uint8_t *bytes=cnv->toUBytes; 3016 *bytes++=*lastSource++; /* first byte */ 3017 if(lastSource==source) { 3018 cnv->toULength=1; 3019 } else /* lastSource<source: multi-byte character */ { 3020 int8_t i; 3021 for(i=1; 3022 lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); 3023 ++i 3024 ) { 3025 *bytes++=*lastSource++; 3026 } 3027 cnv->toULength=i; 3028 source=lastSource; 3029 } 3030 } else { 3031 /* no output because of empty input or only state changes */ 3032 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 3033 } 3034 c=0xffff; 3035 } 3036 3037 /* set the converter state back into UConverter, ready for a new character */ 3038 cnv->toUnicodeStatus=0; 3039 cnv->mode=state; 3040 3041 /* write back the updated pointer */ 3042 pArgs->source=(const char *)source; 3043 return c; 3044} 3045 3046#if 0 3047/* 3048 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 3049 * Removal improves code coverage. 3050 */ 3051/** 3052 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. 3053 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3054 * It does not handle conversion extensions (_extToU()). 3055 */ 3056U_CFUNC UChar32 3057ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, 3058 uint8_t b, UBool useFallback) { 3059 int32_t entry; 3060 uint8_t action; 3061 3062 entry=sharedData->mbcs.stateTable[0][b]; 3063 /* MBCS_ENTRY_IS_FINAL(entry) */ 3064 3065 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 3066 /* output BMP code point */ 3067 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3068 } 3069 3070 /* 3071 * An if-else-if chain provides more reliable performance for 3072 * the most common cases compared to a switch. 
3073 */ 3074 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3075 if(action==MBCS_STATE_VALID_DIRECT_20) { 3076 /* output supplementary code point */ 3077 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3078 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3079 if(!TO_U_USE_FALLBACK(useFallback)) { 3080 return 0xfffe; 3081 } 3082 /* output BMP code point */ 3083 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3084 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3085 if(!TO_U_USE_FALLBACK(useFallback)) { 3086 return 0xfffe; 3087 } 3088 /* output supplementary code point */ 3089 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3090 } else if(action==MBCS_STATE_UNASSIGNED) { 3091 return 0xfffe; 3092 } else if(action==MBCS_STATE_ILLEGAL) { 3093 return 0xffff; 3094 } else { 3095 /* reserved, must never occur */ 3096 return 0xffff; 3097 } 3098} 3099#endif 3100 3101/* 3102 * This is a simple version of _MBCSGetNextUChar() that is used 3103 * by other converter implementations. 3104 * It only returns an "assigned" result if it consumes the entire input. 3105 * It does not use state from the converter, nor error codes. 3106 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3107 * It handles conversion extensions but not GB 18030. 3108 * 3109 * Return value: 3110 * U+fffe unassigned 3111 * U+ffff illegal 3112 * otherwise the Unicode code point 3113 */ 3114U_CFUNC UChar32 3115ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 3116 const char *source, int32_t length, 3117 UBool useFallback) { 3118 const int32_t (*stateTable)[256]; 3119 const uint16_t *unicodeCodeUnits; 3120 3121 uint32_t offset; 3122 uint8_t state, action; 3123 3124 UChar32 c; 3125 int32_t i, entry; 3126 3127 if(length<=0) { 3128 /* no input at all: "illegal" */ 3129 return 0xffff; 3130 } 3131 3132#if 0 3133/* 3134 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. 
markus 3135 * TODO In future releases, verify that this function is never called for SBCS 3136 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 3137 * Removal improves code coverage. 3138 */ 3139 /* use optimized function if possible */ 3140 if(sharedData->mbcs.countStates==1) { 3141 if(length==1) { 3142 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 3143 } else { 3144 return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 3145 } 3146 } 3147#endif 3148 3149 /* set up the local pointers */ 3150 stateTable=sharedData->mbcs.stateTable; 3151 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 3152 3153 /* converter state */ 3154 offset=0; 3155 state=sharedData->mbcs.dbcsOnlyState; 3156 3157 /* conversion loop */ 3158 for(i=0;;) { 3159 entry=stateTable[state][(uint8_t)source[i++]]; 3160 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3161 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3162 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3163 3164 if(i==length) { 3165 return 0xffff; /* truncated character */ 3166 } 3167 } else { 3168 /* 3169 * An if-else-if chain provides more reliable performance for 3170 * the most common cases compared to a switch. 
3171 */ 3172 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3173 if(action==MBCS_STATE_VALID_16) { 3174 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3175 c=unicodeCodeUnits[offset]; 3176 if(c!=0xfffe) { 3177 /* done */ 3178 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3179 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 3180 /* else done with 0xfffe */ 3181 } 3182 break; 3183 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 3184 /* output BMP code point */ 3185 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3186 break; 3187 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3188 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3189 c=unicodeCodeUnits[offset++]; 3190 if(c<0xd800) { 3191 /* output BMP code point below 0xd800 */ 3192 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 3193 /* output roundtrip or fallback supplementary code point */ 3194 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 3195 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3196 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3197 c=unicodeCodeUnits[offset]; 3198 } else if(c==0xffff) { 3199 return 0xffff; 3200 } else { 3201 c=0xfffe; 3202 } 3203 break; 3204 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 3205 /* output supplementary code point */ 3206 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3207 break; 3208 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3209 if(!TO_U_USE_FALLBACK(useFallback)) { 3210 c=0xfffe; 3211 break; 3212 } 3213 /* output BMP code point */ 3214 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3215 break; 3216 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3217 if(!TO_U_USE_FALLBACK(useFallback)) { 3218 c=0xfffe; 3219 break; 3220 } 3221 /* output supplementary code point */ 3222 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3223 break; 3224 } else if(action==MBCS_STATE_UNASSIGNED) { 3225 c=0xfffe; 3226 break; 3227 } 3228 3229 /* 3230 * forbid MBCS_STATE_CHANGE_ONLY for this function, 3231 * and 
MBCS_STATE_ILLEGAL and reserved action codes 3232 */ 3233 return 0xffff; 3234 } 3235 } 3236 3237 if(i!=length) { 3238 /* illegal for this function: not all input consumed */ 3239 return 0xffff; 3240 } 3241 3242 if(c==0xfffe) { 3243 /* try an extension mapping */ 3244 const int32_t *cx=sharedData->mbcs.extIndexes; 3245 if(cx!=NULL) { 3246 return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 3247 } 3248 } 3249 3250 return c; 3251} 3252 3253/* MBCS-from-Unicode conversion functions ----------------------------------- */ 3254 3255/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ 3256static void 3257ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3258 UErrorCode *pErrorCode) { 3259 UConverter *cnv; 3260 const UChar *source, *sourceLimit; 3261 uint8_t *target; 3262 int32_t targetCapacity; 3263 int32_t *offsets; 3264 3265 const uint16_t *table; 3266 const uint16_t *mbcsIndex; 3267 const uint8_t *bytes; 3268 3269 UChar32 c; 3270 3271 int32_t sourceIndex, nextSourceIndex; 3272 3273 uint32_t stage2Entry; 3274 uint32_t asciiRoundtrips; 3275 uint32_t value; 3276 uint8_t unicodeMask; 3277 3278 /* use optimized function if possible */ 3279 cnv=pArgs->converter; 3280 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3281 3282 /* set up the local pointers */ 3283 source=pArgs->source; 3284 sourceLimit=pArgs->sourceLimit; 3285 target=(uint8_t *)pArgs->target; 3286 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3287 offsets=pArgs->offsets; 3288 3289 table=cnv->sharedData->mbcs.fromUnicodeTable; 3290 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3291 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3292 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3293 } else { 3294 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3295 } 3296 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3297 3298 /* get the converter state from UConverter */ 3299 c=cnv->fromUChar32; 3300 3301 /* sourceIndex=-1 if the 
current character began in the previous buffer */ 3302 sourceIndex= c==0 ? 0 : -1; 3303 nextSourceIndex=0; 3304 3305 /* conversion loop */ 3306 if(c!=0 && targetCapacity>0) { 3307 goto getTrail; 3308 } 3309 3310 while(source<sourceLimit) { 3311 /* 3312 * This following test is to see if available input would overflow the output. 3313 * It does not catch output of more than one byte that 3314 * overflows as a result of a multi-byte character or callback output 3315 * from the last source character. 3316 * Therefore, those situations also test for overflows and will 3317 * then break the loop, too. 3318 */ 3319 if(targetCapacity>0) { 3320 /* 3321 * Get a correct Unicode code point: 3322 * a single UChar for a BMP code point or 3323 * a matched surrogate pair for a "supplementary code point". 3324 */ 3325 c=*source++; 3326 ++nextSourceIndex; 3327 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3328 *target++=(uint8_t)c; 3329 if(offsets!=NULL) { 3330 *offsets++=sourceIndex; 3331 sourceIndex=nextSourceIndex; 3332 } 3333 --targetCapacity; 3334 c=0; 3335 continue; 3336 } 3337 /* 3338 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 3339 * to avoid dealing with surrogates. 3340 * MBCS_FAST_MAX must be >=0xd7ff. 3341 */ 3342 if(c<=0xd7ff) { 3343 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 3344 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 3345 if(value==0) { 3346 goto unassigned; 3347 } 3348 /* output the value */ 3349 } else { 3350 /* 3351 * This also tests if the codepage maps single surrogates. 3352 * If it does, then surrogates are not paired but mapped separately. 3353 * Note that in this case unmatched surrogates are not detected. 
3354 */ 3355 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3356 if(UTF_IS_SURROGATE_FIRST(c)) { 3357getTrail: 3358 if(source<sourceLimit) { 3359 /* test the following code unit */ 3360 UChar trail=*source; 3361 if(UTF_IS_SECOND_SURROGATE(trail)) { 3362 ++source; 3363 ++nextSourceIndex; 3364 c=UTF16_GET_PAIR_VALUE(c, trail); 3365 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3366 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3367 /* callback(unassigned) */ 3368 goto unassigned; 3369 } 3370 /* convert this supplementary code point */ 3371 /* exit this condition tree */ 3372 } else { 3373 /* this is an unmatched lead code unit (1st surrogate) */ 3374 /* callback(illegal) */ 3375 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3376 break; 3377 } 3378 } else { 3379 /* no more input */ 3380 break; 3381 } 3382 } else { 3383 /* this is an unmatched trail code unit (2nd surrogate) */ 3384 /* callback(illegal) */ 3385 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3386 break; 3387 } 3388 } 3389 3390 /* convert the Unicode code point in c into codepage bytes */ 3391 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3392 3393 /* get the bytes and the length for the output */ 3394 /* MBCS_OUTPUT_2 */ 3395 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3396 3397 /* is this code point assigned, or do we use fallbacks? */ 3398 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 3399 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 3400 ) { 3401 /* 3402 * We allow a 0 byte output if the "assigned" bit is set for this entry. 3403 * There is no way with this data structure for fallback output 3404 * to be a zero byte. 
3405 */ 3406 3407unassigned: 3408 /* try an extension mapping */ 3409 pArgs->source=source; 3410 c=_extFromU(cnv, cnv->sharedData, 3411 c, &source, sourceLimit, 3412 &target, target+targetCapacity, 3413 &offsets, sourceIndex, 3414 pArgs->flush, 3415 pErrorCode); 3416 nextSourceIndex+=(int32_t)(source-pArgs->source); 3417 3418 if(U_FAILURE(*pErrorCode)) { 3419 /* not mappable or buffer overflow */ 3420 break; 3421 } else { 3422 /* a mapping was written to the target, continue */ 3423 3424 /* recalculate the targetCapacity after an extension mapping */ 3425 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3426 3427 /* normal end of conversion: prepare for a new character */ 3428 sourceIndex=nextSourceIndex; 3429 continue; 3430 } 3431 } 3432 } 3433 3434 /* write the output character bytes from value and length */ 3435 /* from the first if in the loop we know that targetCapacity>0 */ 3436 if(value<=0xff) { 3437 /* this is easy because we know that there is enough space */ 3438 *target++=(uint8_t)value; 3439 if(offsets!=NULL) { 3440 *offsets++=sourceIndex; 3441 } 3442 --targetCapacity; 3443 } else /* length==2 */ { 3444 *target++=(uint8_t)(value>>8); 3445 if(2<=targetCapacity) { 3446 *target++=(uint8_t)value; 3447 if(offsets!=NULL) { 3448 *offsets++=sourceIndex; 3449 *offsets++=sourceIndex; 3450 } 3451 targetCapacity-=2; 3452 } else { 3453 if(offsets!=NULL) { 3454 *offsets++=sourceIndex; 3455 } 3456 cnv->charErrorBuffer[0]=(char)value; 3457 cnv->charErrorBufferLength=1; 3458 3459 /* target overflow */ 3460 targetCapacity=0; 3461 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3462 c=0; 3463 break; 3464 } 3465 } 3466 3467 /* normal end of conversion: prepare for a new character */ 3468 c=0; 3469 sourceIndex=nextSourceIndex; 3470 continue; 3471 } else { 3472 /* target is full */ 3473 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3474 break; 3475 } 3476 } 3477 3478 /* set the converter state back into UConverter */ 3479 cnv->fromUChar32=c; 3480 3481 /* write back the updated 
pointers */ 3482 pArgs->source=source; 3483 pArgs->target=(char *)target; 3484 pArgs->offsets=offsets; 3485} 3486 3487/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ 3488static void 3489ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3490 UErrorCode *pErrorCode) { 3491 UConverter *cnv; 3492 const UChar *source, *sourceLimit; 3493 uint8_t *target; 3494 int32_t targetCapacity; 3495 int32_t *offsets; 3496 3497 const uint16_t *table; 3498 const uint16_t *results; 3499 3500 UChar32 c; 3501 3502 int32_t sourceIndex, nextSourceIndex; 3503 3504 uint16_t value, minValue; 3505 UBool hasSupplementary; 3506 3507 /* set up the local pointers */ 3508 cnv=pArgs->converter; 3509 source=pArgs->source; 3510 sourceLimit=pArgs->sourceLimit; 3511 target=(uint8_t *)pArgs->target; 3512 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3513 offsets=pArgs->offsets; 3514 3515 table=cnv->sharedData->mbcs.fromUnicodeTable; 3516 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3517 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3518 } else { 3519 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3520 } 3521 3522 if(cnv->useFallback) { 3523 /* use all roundtrip and fallback results */ 3524 minValue=0x800; 3525 } else { 3526 /* use only roundtrips and fallbacks from private-use characters */ 3527 minValue=0xc00; 3528 } 3529 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 3530 3531 /* get the converter state from UConverter */ 3532 c=cnv->fromUChar32; 3533 3534 /* sourceIndex=-1 if the current character began in the previous buffer */ 3535 sourceIndex= c==0 ? 0 : -1; 3536 nextSourceIndex=0; 3537 3538 /* conversion loop */ 3539 if(c!=0 && targetCapacity>0) { 3540 goto getTrail; 3541 } 3542 3543 while(source<sourceLimit) { 3544 /* 3545 * This following test is to see if available input would overflow the output. 
3546 * It does not catch output of more than one byte that 3547 * overflows as a result of a multi-byte character or callback output 3548 * from the last source character. 3549 * Therefore, those situations also test for overflows and will 3550 * then break the loop, too. 3551 */ 3552 if(targetCapacity>0) { 3553 /* 3554 * Get a correct Unicode code point: 3555 * a single UChar for a BMP code point or 3556 * a matched surrogate pair for a "supplementary code point". 3557 */ 3558 c=*source++; 3559 ++nextSourceIndex; 3560 if(UTF_IS_SURROGATE(c)) { 3561 if(UTF_IS_SURROGATE_FIRST(c)) { 3562getTrail: 3563 if(source<sourceLimit) { 3564 /* test the following code unit */ 3565 UChar trail=*source; 3566 if(UTF_IS_SECOND_SURROGATE(trail)) { 3567 ++source; 3568 ++nextSourceIndex; 3569 c=UTF16_GET_PAIR_VALUE(c, trail); 3570 if(!hasSupplementary) { 3571 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3572 /* callback(unassigned) */ 3573 goto unassigned; 3574 } 3575 /* convert this supplementary code point */ 3576 /* exit this condition tree */ 3577 } else { 3578 /* this is an unmatched lead code unit (1st surrogate) */ 3579 /* callback(illegal) */ 3580 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3581 break; 3582 } 3583 } else { 3584 /* no more input */ 3585 break; 3586 } 3587 } else { 3588 /* this is an unmatched trail code unit (2nd surrogate) */ 3589 /* callback(illegal) */ 3590 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3591 break; 3592 } 3593 } 3594 3595 /* convert the Unicode code point in c into codepage bytes */ 3596 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3597 3598 /* is this code point assigned, or do we use fallbacks? 
*/ 3599 if(value>=minValue) { 3600 /* assigned, write the output character bytes from value and length */ 3601 /* length==1 */ 3602 /* this is easy because we know that there is enough space */ 3603 *target++=(uint8_t)value; 3604 if(offsets!=NULL) { 3605 *offsets++=sourceIndex; 3606 } 3607 --targetCapacity; 3608 3609 /* normal end of conversion: prepare for a new character */ 3610 c=0; 3611 sourceIndex=nextSourceIndex; 3612 } else { /* unassigned */ 3613unassigned: 3614 /* try an extension mapping */ 3615 pArgs->source=source; 3616 c=_extFromU(cnv, cnv->sharedData, 3617 c, &source, sourceLimit, 3618 &target, target+targetCapacity, 3619 &offsets, sourceIndex, 3620 pArgs->flush, 3621 pErrorCode); 3622 nextSourceIndex+=(int32_t)(source-pArgs->source); 3623 3624 if(U_FAILURE(*pErrorCode)) { 3625 /* not mappable or buffer overflow */ 3626 break; 3627 } else { 3628 /* a mapping was written to the target, continue */ 3629 3630 /* recalculate the targetCapacity after an extension mapping */ 3631 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3632 3633 /* normal end of conversion: prepare for a new character */ 3634 sourceIndex=nextSourceIndex; 3635 } 3636 } 3637 } else { 3638 /* target is full */ 3639 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3640 break; 3641 } 3642 } 3643 3644 /* set the converter state back into UConverter */ 3645 cnv->fromUChar32=c; 3646 3647 /* write back the updated pointers */ 3648 pArgs->source=source; 3649 pArgs->target=(char *)target; 3650 pArgs->offsets=offsets; 3651} 3652 3653/* 3654 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 3655 * that map only to and from the BMP. 3656 * In addition to single-byte/state optimizations, the offset calculations 3657 * become much easier. 3658 * It would be possible to use the sbcsIndex for UTF-8-friendly tables, 3659 * but measurements have shown that this diminishes performance 3660 * in more cases than it improves it. 
3661 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches 3662 * for various MBCS and SBCS optimizations. 3663 */ 3664static void 3665ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, 3666 UErrorCode *pErrorCode) { 3667 UConverter *cnv; 3668 const UChar *source, *sourceLimit, *lastSource; 3669 uint8_t *target; 3670 int32_t targetCapacity, length; 3671 int32_t *offsets; 3672 3673 const uint16_t *table; 3674 const uint16_t *results; 3675 3676 UChar32 c; 3677 3678 int32_t sourceIndex; 3679 3680 uint32_t asciiRoundtrips; 3681 uint16_t value, minValue; 3682 3683 /* set up the local pointers */ 3684 cnv=pArgs->converter; 3685 source=pArgs->source; 3686 sourceLimit=pArgs->sourceLimit; 3687 target=(uint8_t *)pArgs->target; 3688 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3689 offsets=pArgs->offsets; 3690 3691 table=cnv->sharedData->mbcs.fromUnicodeTable; 3692 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3693 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3694 } else { 3695 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3696 } 3697 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3698 3699 if(cnv->useFallback) { 3700 /* use all roundtrip and fallback results */ 3701 minValue=0x800; 3702 } else { 3703 /* use only roundtrips and fallbacks from private-use characters */ 3704 minValue=0xc00; 3705 } 3706 3707 /* get the converter state from UConverter */ 3708 c=cnv->fromUChar32; 3709 3710 /* sourceIndex=-1 if the current character began in the previous buffer */ 3711 sourceIndex= c==0 ? 
0 : -1; 3712 lastSource=source; 3713 3714 /* 3715 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 3716 * for the minimum of the sourceLength and targetCapacity 3717 */ 3718 length=(int32_t)(sourceLimit-source); 3719 if(length<targetCapacity) { 3720 targetCapacity=length; 3721 } 3722 3723 /* conversion loop */ 3724 if(c!=0 && targetCapacity>0) { 3725 goto getTrail; 3726 } 3727 3728#if MBCS_UNROLL_SINGLE_FROM_BMP 3729 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3730 /* unroll the loop with the most common case */ 3731unrolled: 3732 if(targetCapacity>=4) { 3733 int32_t count, loops; 3734 uint16_t andedValues; 3735 3736 loops=count=targetCapacity>>2; 3737 do { 3738 c=*source++; 3739 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3740 *target++=(uint8_t)value; 3741 c=*source++; 3742 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3743 *target++=(uint8_t)value; 3744 c=*source++; 3745 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3746 *target++=(uint8_t)value; 3747 c=*source++; 3748 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3749 *target++=(uint8_t)value; 3750 3751 /* were all 4 entries really valid? */ 3752 if(andedValues<minValue) { 3753 /* no, return to the first of these 4 */ 3754 source-=4; 3755 target-=4; 3756 break; 3757 } 3758 } while(--count>0); 3759 count=loops-count; 3760 targetCapacity-=4*count; 3761 3762 if(offsets!=NULL) { 3763 lastSource+=4*count; 3764 while(count>0) { 3765 *offsets++=sourceIndex++; 3766 *offsets++=sourceIndex++; 3767 *offsets++=sourceIndex++; 3768 *offsets++=sourceIndex++; 3769 --count; 3770 } 3771 } 3772 3773 c=0; 3774 } 3775#endif 3776 3777 while(targetCapacity>0) { 3778 /* 3779 * Get a correct Unicode code point: 3780 * a single UChar for a BMP code point or 3781 * a matched surrogate pair for a "supplementary code point". 
3782 */ 3783 c=*source++; 3784 /* 3785 * Do not immediately check for single surrogates: 3786 * Assume that they are unassigned and check for them in that case. 3787 * This speeds up the conversion of assigned characters. 3788 */ 3789 /* convert the Unicode code point in c into codepage bytes */ 3790 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3791 *target++=(uint8_t)c; 3792 --targetCapacity; 3793 c=0; 3794 continue; 3795 } 3796 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3797 /* is this code point assigned, or do we use fallbacks? */ 3798 if(value>=minValue) { 3799 /* assigned, write the output character bytes from value and length */ 3800 /* length==1 */ 3801 /* this is easy because we know that there is enough space */ 3802 *target++=(uint8_t)value; 3803 --targetCapacity; 3804 3805 /* normal end of conversion: prepare for a new character */ 3806 c=0; 3807 continue; 3808 } else if(!UTF_IS_SURROGATE(c)) { 3809 /* normal, unassigned BMP character */ 3810 } else if(UTF_IS_SURROGATE_FIRST(c)) { 3811getTrail: 3812 if(source<sourceLimit) { 3813 /* test the following code unit */ 3814 UChar trail=*source; 3815 if(UTF_IS_SECOND_SURROGATE(trail)) { 3816 ++source; 3817 c=UTF16_GET_PAIR_VALUE(c, trail); 3818 /* this codepage does not map supplementary code points */ 3819 /* callback(unassigned) */ 3820 } else { 3821 /* this is an unmatched lead code unit (1st surrogate) */ 3822 /* callback(illegal) */ 3823 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3824 break; 3825 } 3826 } else { 3827 /* no more input */ 3828 if (pArgs->flush) { 3829 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3830 } 3831 break; 3832 } 3833 } else { 3834 /* this is an unmatched trail code unit (2nd surrogate) */ 3835 /* callback(illegal) */ 3836 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3837 break; 3838 } 3839 3840 /* c does not have a mapping */ 3841 3842 /* get the number of code units for c to correctly advance sourceIndex */ 3843 length=U16_LENGTH(c); 3844 3845 /* set offsets since the start or the last 
extension */ 3846 if(offsets!=NULL) { 3847 int32_t count=(int32_t)(source-lastSource); 3848 3849 /* do not set the offset for this character */ 3850 count-=length; 3851 3852 while(count>0) { 3853 *offsets++=sourceIndex++; 3854 --count; 3855 } 3856 /* offsets and sourceIndex are now set for the current character */ 3857 } 3858 3859 /* try an extension mapping */ 3860 lastSource=source; 3861 c=_extFromU(cnv, cnv->sharedData, 3862 c, &source, sourceLimit, 3863 &target, (const uint8_t *)(pArgs->targetLimit), 3864 &offsets, sourceIndex, 3865 pArgs->flush, 3866 pErrorCode); 3867 sourceIndex+=length+(int32_t)(source-lastSource); 3868 lastSource=source; 3869 3870 if(U_FAILURE(*pErrorCode)) { 3871 /* not mappable or buffer overflow */ 3872 break; 3873 } else { 3874 /* a mapping was written to the target, continue */ 3875 3876 /* recalculate the targetCapacity after an extension mapping */ 3877 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3878 length=(int32_t)(sourceLimit-source); 3879 if(length<targetCapacity) { 3880 targetCapacity=length; 3881 } 3882 } 3883 3884#if MBCS_UNROLL_SINGLE_FROM_BMP 3885 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3886 goto unrolled; 3887#endif 3888 } 3889 3890 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { 3891 /* target is full */ 3892 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3893 } 3894 3895 /* set offsets since the start or the last callback */ 3896 if(offsets!=NULL) { 3897 size_t count=source-lastSource; 3898 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) { 3899 /* 3900 Caller gave us a partial supplementary character, 3901 which this function couldn't convert in any case. 3902 The callback will handle the offset. 
3903 */ 3904 count--; 3905 } 3906 while(count>0) { 3907 *offsets++=sourceIndex++; 3908 --count; 3909 } 3910 } 3911 3912 /* set the converter state back into UConverter */ 3913 cnv->fromUChar32=c; 3914 3915 /* write back the updated pointers */ 3916 pArgs->source=source; 3917 pArgs->target=(char *)target; 3918 pArgs->offsets=offsets; 3919} 3920 3921U_CFUNC void 3922ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3923 UErrorCode *pErrorCode) { 3924 UConverter *cnv; 3925 const UChar *source, *sourceLimit; 3926 uint8_t *target; 3927 int32_t targetCapacity; 3928 int32_t *offsets; 3929 3930 const uint16_t *table; 3931 const uint16_t *mbcsIndex; 3932 const uint8_t *p, *bytes; 3933 uint8_t outputType; 3934 3935 UChar32 c; 3936 3937 int32_t prevSourceIndex, sourceIndex, nextSourceIndex; 3938 3939 uint32_t stage2Entry; 3940 uint32_t asciiRoundtrips; 3941 uint32_t value; 3942 uint8_t si_value[2] = {0, 0}; 3943 uint8_t so_value[2] = {0, 0}; 3944 uint8_t si_value_length, so_value_length; 3945 int32_t length = 0, prevLength; 3946 uint8_t unicodeMask; 3947 3948 cnv=pArgs->converter; 3949 3950 if(cnv->preFromUFirstCP>=0) { 3951 /* 3952 * pass sourceIndex=-1 because we continue from an earlier buffer 3953 * in the future, this may change with continuous offsets 3954 */ 3955 ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode); 3956 3957 if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) { 3958 return; 3959 } 3960 } 3961 3962 /* use optimized function if possible */ 3963 outputType=cnv->sharedData->mbcs.outputType; 3964 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3965 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3966 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3967 ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode); 3968 } else { 3969 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode); 3970 } 3971 return; 3972 } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) { 3973 
ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode); 3974 return; 3975 } 3976 3977 /* set up the local pointers */ 3978 source=pArgs->source; 3979 sourceLimit=pArgs->sourceLimit; 3980 target=(uint8_t *)pArgs->target; 3981 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3982 offsets=pArgs->offsets; 3983 3984 table=cnv->sharedData->mbcs.fromUnicodeTable; 3985 if(cnv->sharedData->mbcs.utf8Friendly) { 3986 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3987 } else { 3988 mbcsIndex=NULL; 3989 } 3990 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3991 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3992 } else { 3993 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3994 } 3995 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3996 3997 /* get the converter state from UConverter */ 3998 c=cnv->fromUChar32; 3999 4000 if(outputType==MBCS_OUTPUT_2_SISO) { 4001 prevLength=cnv->fromUnicodeStatus; 4002 if(prevLength==0) { 4003 /* set the real value */ 4004 prevLength=1; 4005 } 4006 } else { 4007 /* prevent fromUnicodeStatus from being set to something non-0 */ 4008 prevLength=0; 4009 } 4010 4011 /* sourceIndex=-1 if the current character began in the previous buffer */ 4012 prevSourceIndex=-1; 4013 sourceIndex= c==0 ? 0 : -1; 4014 nextSourceIndex=0; 4015 4016 /* Get the SI/SO character for the converter */ 4017 si_value_length = getSISOBytes(SI, cnv->options, si_value); 4018 so_value_length = getSISOBytes(SO, cnv->options, so_value); 4019 4020 /* conversion loop */ 4021 /* 4022 * This is another piece of ugly code: 4023 * A goto into the loop if the converter state contains a first surrogate 4024 * from the previous function call. 4025 * It saves me to check in each loop iteration a check of if(c==0) 4026 * and duplicating the trail-surrogate-handling code in the else 4027 * branch of that check. 
4028 * I could not find any other way to get around this other than 4029 * using a function call for the conversion and callback, which would 4030 * be even more inefficient. 4031 * 4032 * Markus Scherer 2000-jul-19 4033 */ 4034 if(c!=0 && targetCapacity>0) { 4035 goto getTrail; 4036 } 4037 4038 while(source<sourceLimit) { 4039 /* 4040 * This following test is to see if available input would overflow the output. 4041 * It does not catch output of more than one byte that 4042 * overflows as a result of a multi-byte character or callback output 4043 * from the last source character. 4044 * Therefore, those situations also test for overflows and will 4045 * then break the loop, too. 4046 */ 4047 if(targetCapacity>0) { 4048 /* 4049 * Get a correct Unicode code point: 4050 * a single UChar for a BMP code point or 4051 * a matched surrogate pair for a "supplementary code point". 4052 */ 4053 c=*source++; 4054 ++nextSourceIndex; 4055 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 4056 *target++=(uint8_t)c; 4057 if(offsets!=NULL) { 4058 *offsets++=sourceIndex; 4059 prevSourceIndex=sourceIndex; 4060 sourceIndex=nextSourceIndex; 4061 } 4062 --targetCapacity; 4063 c=0; 4064 continue; 4065 } 4066 /* 4067 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 4068 * to avoid dealing with surrogates. 4069 * MBCS_FAST_MAX must be >=0xd7ff. 4070 */ 4071 if(c<=0xd7ff && mbcsIndex!=NULL) { 4072 value=mbcsIndex[c>>6]; 4073 4074 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */ 4075 /* There are only roundtrips (!=0) and no-mapping (==0) entries. 
*/ 4076 switch(outputType) { 4077 case MBCS_OUTPUT_2: 4078 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4079 if(value<=0xff) { 4080 if(value==0) { 4081 goto unassigned; 4082 } else { 4083 length=1; 4084 } 4085 } else { 4086 length=2; 4087 } 4088 break; 4089 case MBCS_OUTPUT_2_SISO: 4090 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4091 /* 4092 * Save the old state in the converter object 4093 * right here, then change the local prevLength state variable if necessary. 4094 * Then, if this character turns out to be unassigned or a fallback that 4095 * is not taken, the callback code must not save the new state in the converter 4096 * because the new state is for a character that is not output. 4097 * However, the callback must still restore the state from the converter 4098 * in case the callback function changed it for its output. 4099 */ 4100 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4101 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4102 if(value<=0xff) { 4103 if(value==0) { 4104 goto unassigned; 4105 } else if(prevLength<=1) { 4106 length=1; 4107 } else { 4108 /* change from double-byte mode to single-byte */ 4109 if (si_value_length == 1) { 4110 value|=(uint32_t)si_value[0]<<8; 4111 length = 2; 4112 } else if (si_value_length == 2) { 4113 value|=(uint32_t)si_value[1]<<8; 4114 value|=(uint32_t)si_value[0]<<16; 4115 length = 3; 4116 } 4117 prevLength=1; 4118 } 4119 } else { 4120 if(prevLength==2) { 4121 length=2; 4122 } else { 4123 /* change from single-byte mode to double-byte */ 4124 if (so_value_length == 1) { 4125 value|=(uint32_t)so_value[0]<<16; 4126 length = 3; 4127 } else if (so_value_length == 2) { 4128 value|=(uint32_t)so_value[1]<<16; 4129 value|=(uint32_t)so_value[0]<<24; 4130 length = 4; 4131 } 4132 prevLength=2; 4133 } 4134 } 4135 break; 4136 case MBCS_OUTPUT_DBCS_ONLY: 4137 /* table with single-byte results, but only DBCS mappings used */ 4138 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4139 if(value<=0xff) { 4140 
/* no mapping or SBCS result, not taken for DBCS-only */ 4141 goto unassigned; 4142 } else { 4143 length=2; 4144 } 4145 break; 4146 case MBCS_OUTPUT_3: 4147 p=bytes+(value+(c&0x3f))*3; 4148 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4149 if(value<=0xff) { 4150 if(value==0) { 4151 goto unassigned; 4152 } else { 4153 length=1; 4154 } 4155 } else if(value<=0xffff) { 4156 length=2; 4157 } else { 4158 length=3; 4159 } 4160 break; 4161 case MBCS_OUTPUT_4: 4162 value=((const uint32_t *)bytes)[value +(c&0x3f)]; 4163 if(value<=0xff) { 4164 if(value==0) { 4165 goto unassigned; 4166 } else { 4167 length=1; 4168 } 4169 } else if(value<=0xffff) { 4170 length=2; 4171 } else if(value<=0xffffff) { 4172 length=3; 4173 } else { 4174 length=4; 4175 } 4176 break; 4177 case MBCS_OUTPUT_3_EUC: 4178 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4179 /* EUC 16-bit fixed-length representation */ 4180 if(value<=0xff) { 4181 if(value==0) { 4182 goto unassigned; 4183 } else { 4184 length=1; 4185 } 4186 } else if((value&0x8000)==0) { 4187 value|=0x8e8000; 4188 length=3; 4189 } else if((value&0x80)==0) { 4190 value|=0x8f0080; 4191 length=3; 4192 } else { 4193 length=2; 4194 } 4195 break; 4196 case MBCS_OUTPUT_4_EUC: 4197 p=bytes+(value+(c&0x3f))*3; 4198 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4199 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4200 if(value<=0xff) { 4201 if(value==0) { 4202 goto unassigned; 4203 } else { 4204 length=1; 4205 } 4206 } else if(value<=0xffff) { 4207 length=2; 4208 } else if((value&0x800000)==0) { 4209 value|=0x8e800000; 4210 length=4; 4211 } else if((value&0x8000)==0) { 4212 value|=0x8f008000; 4213 length=4; 4214 } else { 4215 length=3; 4216 } 4217 break; 4218 default: 4219 /* must not occur */ 4220 /* 4221 * To avoid compiler warnings that value & length may be 4222 * used without having been initialized, we set them here. 4223 * In reality, this is unreachable code. 
4224 * Not having a default branch also causes warnings with 4225 * some compilers. 4226 */ 4227 value=0; 4228 length=0; 4229 break; 4230 } 4231 /* output the value */ 4232 } else { 4233 /* 4234 * This also tests if the codepage maps single surrogates. 4235 * If it does, then surrogates are not paired but mapped separately. 4236 * Note that in this case unmatched surrogates are not detected. 4237 */ 4238 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 4239 if(UTF_IS_SURROGATE_FIRST(c)) { 4240getTrail: 4241 if(source<sourceLimit) { 4242 /* test the following code unit */ 4243 UChar trail=*source; 4244 if(UTF_IS_SECOND_SURROGATE(trail)) { 4245 ++source; 4246 ++nextSourceIndex; 4247 c=UTF16_GET_PAIR_VALUE(c, trail); 4248 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4249 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4250 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4251 /* callback(unassigned) */ 4252 goto unassigned; 4253 } 4254 /* convert this supplementary code point */ 4255 /* exit this condition tree */ 4256 } else { 4257 /* this is an unmatched lead code unit (1st surrogate) */ 4258 /* callback(illegal) */ 4259 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4260 break; 4261 } 4262 } else { 4263 /* no more input */ 4264 break; 4265 } 4266 } else { 4267 /* this is an unmatched trail code unit (2nd surrogate) */ 4268 /* callback(illegal) */ 4269 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4270 break; 4271 } 4272 } 4273 4274 /* convert the Unicode code point in c into codepage bytes */ 4275 4276 /* 4277 * The basic lookup is a triple-stage compact array (trie) lookup. 4278 * For details see the beginning of this file. 4279 * 4280 * Single-byte codepages are handled with a different data structure 4281 * by _MBCSSingle... functions. 4282 * 4283 * The result consists of a 32-bit value from stage 2 and 4284 * a pointer to as many bytes as are stored per character. 
4285 * The pointer points to the character's bytes in stage 3. 4286 * Bits 15..0 of the stage 2 entry contain the stage 3 index 4287 * for that pointer, while bits 31..16 are flags for which of 4288 * the 16 characters in the block are roundtrip-assigned. 4289 * 4290 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t 4291 * respectively as uint32_t, in the platform encoding. 4292 * For 3-byte codepages, the bytes are always stored in big-endian order. 4293 * 4294 * For EUC encodings that use only either 0x8e or 0x8f as the first 4295 * byte of their longest byte sequences, the first two bytes in 4296 * this third stage indicate with their 7th bits whether these bytes 4297 * are to be written directly or actually need to be preceeded by 4298 * one of the two Single-Shift codes. With this, the third stage 4299 * stores one byte fewer per character than the actual maximum length of 4300 * EUC byte sequences. 4301 * 4302 * Other than that, leading zero bytes are removed and the other 4303 * bytes output. A single zero byte may be output if the "assigned" 4304 * bit in stage 2 was on. 4305 * The data structure does not support zero byte output as a fallback, 4306 * and also does not allow output of leading zeros. 4307 */ 4308 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4309 4310 /* get the bytes and the length for the output */ 4311 switch(outputType) { 4312 case MBCS_OUTPUT_2: 4313 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4314 if(value<=0xff) { 4315 length=1; 4316 } else { 4317 length=2; 4318 } 4319 break; 4320 case MBCS_OUTPUT_2_SISO: 4321 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4322 /* 4323 * Save the old state in the converter object 4324 * right here, then change the local prevLength state variable if necessary. 
4325 * Then, if this character turns out to be unassigned or a fallback that 4326 * is not taken, the callback code must not save the new state in the converter 4327 * because the new state is for a character that is not output. 4328 * However, the callback must still restore the state from the converter 4329 * in case the callback function changed it for its output. 4330 */ 4331 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4332 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4333 if(value<=0xff) { 4334 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) { 4335 /* no mapping, leave value==0 */ 4336 length=0; 4337 } else if(prevLength<=1) { 4338 length=1; 4339 } else { 4340 /* change from double-byte mode to single-byte */ 4341 if (si_value_length == 1) { 4342 value|=(uint32_t)si_value[0]<<8; 4343 length = 2; 4344 } else if (si_value_length == 2) { 4345 value|=(uint32_t)si_value[1]<<8; 4346 value|=(uint32_t)si_value[0]<<16; 4347 length = 3; 4348 } 4349 prevLength=1; 4350 } 4351 } else { 4352 if(prevLength==2) { 4353 length=2; 4354 } else { 4355 /* change from single-byte mode to double-byte */ 4356 if (so_value_length == 1) { 4357 value|=(uint32_t)so_value[0]<<16; 4358 length = 3; 4359 } else if (so_value_length == 2) { 4360 value|=(uint32_t)so_value[1]<<16; 4361 value|=(uint32_t)so_value[0]<<24; 4362 length = 4; 4363 } 4364 prevLength=2; 4365 } 4366 } 4367 break; 4368 case MBCS_OUTPUT_DBCS_ONLY: 4369 /* table with single-byte results, but only DBCS mappings used */ 4370 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4371 if(value<=0xff) { 4372 /* no mapping or SBCS result, not taken for DBCS-only */ 4373 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4374 length=0; 4375 } else { 4376 length=2; 4377 } 4378 break; 4379 case MBCS_OUTPUT_3: 4380 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4381 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4382 if(value<=0xff) { 4383 length=1; 4384 } else 
if(value<=0xffff) { 4385 length=2; 4386 } else { 4387 length=3; 4388 } 4389 break; 4390 case MBCS_OUTPUT_4: 4391 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); 4392 if(value<=0xff) { 4393 length=1; 4394 } else if(value<=0xffff) { 4395 length=2; 4396 } else if(value<=0xffffff) { 4397 length=3; 4398 } else { 4399 length=4; 4400 } 4401 break; 4402 case MBCS_OUTPUT_3_EUC: 4403 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4404 /* EUC 16-bit fixed-length representation */ 4405 if(value<=0xff) { 4406 length=1; 4407 } else if((value&0x8000)==0) { 4408 value|=0x8e8000; 4409 length=3; 4410 } else if((value&0x80)==0) { 4411 value|=0x8f0080; 4412 length=3; 4413 } else { 4414 length=2; 4415 } 4416 break; 4417 case MBCS_OUTPUT_4_EUC: 4418 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4419 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4420 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4421 if(value<=0xff) { 4422 length=1; 4423 } else if(value<=0xffff) { 4424 length=2; 4425 } else if((value&0x800000)==0) { 4426 value|=0x8e800000; 4427 length=4; 4428 } else if((value&0x8000)==0) { 4429 value|=0x8f008000; 4430 length=4; 4431 } else { 4432 length=3; 4433 } 4434 break; 4435 default: 4436 /* must not occur */ 4437 /* 4438 * To avoid compiler warnings that value & length may be 4439 * used without having been initialized, we set them here. 4440 * In reality, this is unreachable code. 4441 * Not having a default branch also causes warnings with 4442 * some compilers. 4443 */ 4444 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4445 length=0; 4446 break; 4447 } 4448 4449 /* is this code point assigned, or do we use fallbacks? */ 4450 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 || 4451 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 4452 ) { 4453 /* 4454 * We allow a 0 byte output if the "assigned" bit is set for this entry. 
4455 * There is no way with this data structure for fallback output 4456 * to be a zero byte. 4457 */ 4458 4459unassigned: 4460 /* try an extension mapping */ 4461 pArgs->source=source; 4462 c=_extFromU(cnv, cnv->sharedData, 4463 c, &source, sourceLimit, 4464 &target, target+targetCapacity, 4465 &offsets, sourceIndex, 4466 pArgs->flush, 4467 pErrorCode); 4468 nextSourceIndex+=(int32_t)(source-pArgs->source); 4469 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ 4470 4471 if(U_FAILURE(*pErrorCode)) { 4472 /* not mappable or buffer overflow */ 4473 break; 4474 } else { 4475 /* a mapping was written to the target, continue */ 4476 4477 /* recalculate the targetCapacity after an extension mapping */ 4478 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 4479 4480 /* normal end of conversion: prepare for a new character */ 4481 if(offsets!=NULL) { 4482 prevSourceIndex=sourceIndex; 4483 sourceIndex=nextSourceIndex; 4484 } 4485 continue; 4486 } 4487 } 4488 } 4489 4490 /* write the output character bytes from value and length */ 4491 /* from the first if in the loop we know that targetCapacity>0 */ 4492 if(length<=targetCapacity) { 4493 if(offsets==NULL) { 4494 switch(length) { 4495 /* each branch falls through to the next one */ 4496 case 4: 4497 *target++=(uint8_t)(value>>24); 4498 case 3: 4499 *target++=(uint8_t)(value>>16); 4500 case 2: 4501 *target++=(uint8_t)(value>>8); 4502 case 1: 4503 *target++=(uint8_t)value; 4504 default: 4505 /* will never occur */ 4506 break; 4507 } 4508 } else { 4509 switch(length) { 4510 /* each branch falls through to the next one */ 4511 case 4: 4512 *target++=(uint8_t)(value>>24); 4513 *offsets++=sourceIndex; 4514 case 3: 4515 *target++=(uint8_t)(value>>16); 4516 *offsets++=sourceIndex; 4517 case 2: 4518 *target++=(uint8_t)(value>>8); 4519 *offsets++=sourceIndex; 4520 case 1: 4521 *target++=(uint8_t)value; 4522 *offsets++=sourceIndex; 4523 default: 4524 /* will never occur */ 4525 break; 4526 } 4527 } 4528 
targetCapacity-=length; 4529 } else { 4530 uint8_t *charErrorBuffer; 4531 4532 /* 4533 * We actually do this backwards here: 4534 * In order to save an intermediate variable, we output 4535 * first to the overflow buffer what does not fit into the 4536 * regular target. 4537 */ 4538 /* we know that 1<=targetCapacity<length<=4 */ 4539 length-=targetCapacity; 4540 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 4541 switch(length) { 4542 /* each branch falls through to the next one */ 4543 case 3: 4544 *charErrorBuffer++=(uint8_t)(value>>16); 4545 case 2: 4546 *charErrorBuffer++=(uint8_t)(value>>8); 4547 case 1: 4548 *charErrorBuffer=(uint8_t)value; 4549 default: 4550 /* will never occur */ 4551 break; 4552 } 4553 cnv->charErrorBufferLength=(int8_t)length; 4554 4555 /* now output what fits into the regular target */ 4556 value>>=8*length; /* length was reduced by targetCapacity */ 4557 switch(targetCapacity) { 4558 /* each branch falls through to the next one */ 4559 case 3: 4560 *target++=(uint8_t)(value>>16); 4561 if(offsets!=NULL) { 4562 *offsets++=sourceIndex; 4563 } 4564 case 2: 4565 *target++=(uint8_t)(value>>8); 4566 if(offsets!=NULL) { 4567 *offsets++=sourceIndex; 4568 } 4569 case 1: 4570 *target++=(uint8_t)value; 4571 if(offsets!=NULL) { 4572 *offsets++=sourceIndex; 4573 } 4574 default: 4575 /* will never occur */ 4576 break; 4577 } 4578 4579 /* target overflow */ 4580 targetCapacity=0; 4581 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4582 c=0; 4583 break; 4584 } 4585 4586 /* normal end of conversion: prepare for a new character */ 4587 c=0; 4588 if(offsets!=NULL) { 4589 prevSourceIndex=sourceIndex; 4590 sourceIndex=nextSourceIndex; 4591 } 4592 continue; 4593 } else { 4594 /* target is full */ 4595 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4596 break; 4597 } 4598 } 4599 4600 /* 4601 * the end of the input stream and detection of truncated input 4602 * are handled by the framework, but for EBCDIC_STATEFUL conversion 4603 * we need to emit an SI at the very end 4604 * 
4605 * conditions: 4606 * successful 4607 * EBCDIC_STATEFUL in DBCS mode 4608 * end of input and no truncated input 4609 */ 4610 if( U_SUCCESS(*pErrorCode) && 4611 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && 4612 pArgs->flush && source>=sourceLimit && c==0 4613 ) { 4614 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ 4615 if(targetCapacity>0) { 4616 *target++=(uint8_t)si_value[0]; 4617 if (si_value_length == 2) { 4618 if (targetCapacity<2) { 4619 cnv->charErrorBuffer[0]=(uint8_t)si_value[1]; 4620 cnv->charErrorBufferLength=1; 4621 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4622 } else { 4623 *target++=(uint8_t)si_value[1]; 4624 } 4625 } 4626 if(offsets!=NULL) { 4627 /* set the last source character's index (sourceIndex points at sourceLimit now) */ 4628 *offsets++=prevSourceIndex; 4629 } 4630 } else { 4631 /* target is full */ 4632 cnv->charErrorBuffer[0]=(uint8_t)si_value[0]; 4633 if (si_value_length == 2) { 4634 cnv->charErrorBuffer[1]=(uint8_t)si_value[1]; 4635 } 4636 cnv->charErrorBufferLength=si_value_length; 4637 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4638 } 4639 prevLength=1; /* we switched into SBCS */ 4640 } 4641 4642 /* set the converter state back into UConverter */ 4643 cnv->fromUChar32=c; 4644 cnv->fromUnicodeStatus=prevLength; 4645 4646 /* write back the updated pointers */ 4647 pArgs->source=source; 4648 pArgs->target=(char *)target; 4649 pArgs->offsets=offsets; 4650} 4651 4652/* 4653 * This is another simple conversion function for internal use by other 4654 * conversion implementations. 4655 * It does not use the converter state nor call callbacks. 4656 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4657 * It handles conversion extensions but not GB 18030. 4658 * 4659 * It converts one single Unicode code point into codepage bytes, encoded 4660 * as one 32-bit value. 
The function returns the number of bytes in *pValue: 4661 * 1..4 the number of bytes in *pValue 4662 * 0 unassigned (*pValue undefined) 4663 * -1 illegal (currently not used, *pValue undefined) 4664 * 4665 * *pValue will contain the resulting bytes with the last byte in bits 7..0, 4666 * the second to last byte in bits 15..8, etc. 4667 * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 4668 */ 4669U_CFUNC int32_t 4670ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 4671 UChar32 c, uint32_t *pValue, 4672 UBool useFallback) { 4673 const int32_t *cx; 4674 const uint16_t *table; 4675#if 0 4676/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4677 const uint8_t *p; 4678#endif 4679 uint32_t stage2Entry; 4680 uint32_t value; 4681 int32_t length; 4682 4683 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4684 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4685 table=sharedData->mbcs.fromUnicodeTable; 4686 4687 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4688 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 4689 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4690 /* is this code point assigned, or do we use fallbacks? */ 4691 if(useFallback ? 
value>=0x800 : value>=0xc00) { 4692 *pValue=value&0xff; 4693 return 1; 4694 } 4695 } else /* outputType!=MBCS_OUTPUT_1 */ { 4696 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4697 4698 /* get the bytes and the length for the output */ 4699 switch(sharedData->mbcs.outputType) { 4700 case MBCS_OUTPUT_2: 4701 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4702 if(value<=0xff) { 4703 length=1; 4704 } else { 4705 length=2; 4706 } 4707 break; 4708#if 0 4709/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4710 case MBCS_OUTPUT_DBCS_ONLY: 4711 /* table with single-byte results, but only DBCS mappings used */ 4712 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4713 if(value<=0xff) { 4714 /* no mapping or SBCS result, not taken for DBCS-only */ 4715 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4716 length=0; 4717 } else { 4718 length=2; 4719 } 4720 break; 4721 case MBCS_OUTPUT_3: 4722 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4723 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4724 if(value<=0xff) { 4725 length=1; 4726 } else if(value<=0xffff) { 4727 length=2; 4728 } else { 4729 length=3; 4730 } 4731 break; 4732 case MBCS_OUTPUT_4: 4733 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4734 if(value<=0xff) { 4735 length=1; 4736 } else if(value<=0xffff) { 4737 length=2; 4738 } else if(value<=0xffffff) { 4739 length=3; 4740 } else { 4741 length=4; 4742 } 4743 break; 4744 case MBCS_OUTPUT_3_EUC: 4745 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4746 /* EUC 16-bit fixed-length representation */ 4747 if(value<=0xff) { 4748 length=1; 4749 } else if((value&0x8000)==0) { 4750 value|=0x8e8000; 4751 length=3; 4752 } else if((value&0x80)==0) { 4753 value|=0x8f0080; 4754 length=3; 4755 } else { 4756 length=2; 4757 } 4758 break; 4759 case 
MBCS_OUTPUT_4_EUC: 4760 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4761 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4762 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4763 if(value<=0xff) { 4764 length=1; 4765 } else if(value<=0xffff) { 4766 length=2; 4767 } else if((value&0x800000)==0) { 4768 value|=0x8e800000; 4769 length=4; 4770 } else if((value&0x8000)==0) { 4771 value|=0x8f008000; 4772 length=4; 4773 } else { 4774 length=3; 4775 } 4776 break; 4777#endif 4778 default: 4779 /* must not occur */ 4780 return -1; 4781 } 4782 4783 /* is this code point assigned, or do we use fallbacks? */ 4784 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4785 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 4786 ) { 4787 /* 4788 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4789 * There is no way with this data structure for fallback output 4790 * to be a zero byte. 4791 */ 4792 /* assigned */ 4793 *pValue=value; 4794 return length; 4795 } 4796 } 4797 } 4798 4799 cx=sharedData->mbcs.extIndexes; 4800 if(cx!=NULL) { 4801 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 4802 return length>=0 ? length : -length; /* return abs(length); */ 4803 } 4804 4805 /* unassigned */ 4806 return 0; 4807} 4808 4809 4810#if 0 4811/* 4812 * This function has been moved to ucnv2022.c for inlining. 4813 * This implementation is here only for documentation purposes 4814 */ 4815 4816/** 4817 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. 4818 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4819 * It does not handle conversion extensions (_extFromU()). 4820 * 4821 * It returns the codepage byte for the code point, or -1 if it is unassigned. 
*/
U_CFUNC int32_t
ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                           UChar32 c,
                           UBool useFallback) {
    const uint16_t *table;
    int32_t value;

    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        return -1;
    }

    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
    table=sharedData->mbcs.fromUnicodeTable;

    /* get the byte for the output */
    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
    /* is this code point assigned, or do we use fallbacks? */
    /* the threshold is 0x800 when fallbacks are allowed and 0xc00 otherwise; only the low byte is returned */
    if(useFallback ? value>=0x800 : value>=0xc00) {
        return value&0xff;
    } else {
        return -1;
    }
}
#endif

/* MBCS-from-UTF-8 conversion functions ------------------------------------- */

/*
 * minimum code point values for n-byte UTF-8 sequences, n=0..4
 * (indexed by the sequence length n; a decoded value below the entry for its
 * length is treated as an illegal sequence by the UTF-8 conversion code below)
 */
static const UChar32
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };

/*
 * offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail...
 *
 * Indexed by the sequence length n. Each entry is the sum of the lead-byte and
 * trail-byte marker bits that accumulate when the raw bytes are combined with
 * c=(c<<6)+b; subtracting it from the accumulated value leaves the code point.
 * Example for n=2: (0xC0<<6)+0x80=0x3080.
 * The array has 7 elements but only indexes 0..4 have explicit initializers;
 * the remaining elements are zero-initialized.
 */
static const UChar32
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };

static void
ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                  UConverterToUnicodeArgs *pToUArgs,
                  UErrorCode *pErrorCode) {
    UConverter *utf8, *cnv;
    const uint8_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    const uint16_t *table, *sbcsIndex;
    const uint16_t *results;

    int8_t oldToULength, toULength, toULimit;

    UChar32 c;
    uint8_t b, t1, t2;

    uint32_t asciiRoundtrips;
    uint16_t value, minValue;
    UBool hasSupplementary;

    /* set up the local pointers */
    utf8=pToUArgs->converter;
    cnv=pFromUArgs->converter;
    source=(uint8_t *)pToUArgs->source;
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    target=(uint8_t *)pFromUArgs->target;
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    if(cnv->useFallback) {
        /* use all roundtrip and fallback results */
        minValue=0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue=0xc00;
    }
    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
    c=(UChar32)utf8->toUnicodeStatus;
    if(c!=0) {
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
    } else {
        toULength=oldToULength=toULimit=0;
    }

    /*
     * Make sure that the last byte sequence before sourceLimit is complete
     * or runs into a
lead byte. 4918 * Do not go back into the bytes that will be read for finishing a partial 4919 * sequence from the previous buffer. 4920 * In the conversion loop compare source with sourceLimit only once 4921 * per multi-byte character. 4922 */ 4923 { 4924 int32_t i, length; 4925 4926 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 4927 for(i=0; i<3 && i<length;) { 4928 b=*(sourceLimit-i-1); 4929 if(U8_IS_TRAIL(b)) { 4930 ++i; 4931 } else { 4932 if(i<utf8_countTrailBytes[b]) { 4933 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 4934 sourceLimit-=i+1; 4935 } 4936 break; 4937 } 4938 } 4939 } 4940 4941 if(c!=0 && targetCapacity>0) { 4942 utf8->toUnicodeStatus=0; 4943 utf8->toULength=0; 4944 goto moreBytes; 4945 /* 4946 * Note: We could avoid the goto by duplicating some of the moreBytes 4947 * code, but only up to the point of collecting a complete UTF-8 4948 * sequence; then recurse for the toUBytes[toULength] 4949 * and then continue with normal conversion. 4950 * 4951 * If so, move this code to just after initializing the minimum 4952 * set of local variables for reading the UTF-8 input 4953 * (utf8, source, target, limits but not cnv, table, minValue, etc.). 
4954 * 4955 * Potential advantages: 4956 * - avoid the goto 4957 * - oldToULength could become a local variable in just those code blocks 4958 * that deal with buffer boundaries 4959 * - possibly faster if the goto prevents some compiler optimizations 4960 * (this would need measuring to confirm) 4961 * Disadvantage: 4962 * - code duplication 4963 */ 4964 } 4965 4966 /* conversion loop */ 4967 while(source<sourceLimit) { 4968 if(targetCapacity>0) { 4969 b=*source++; 4970 if((int8_t)b>=0) { 4971 /* convert ASCII */ 4972 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 4973 *target++=(uint8_t)b; 4974 --targetCapacity; 4975 continue; 4976 } else { 4977 c=b; 4978 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c); 4979 } 4980 } else { 4981 if(b<0xe0) { 4982 if( /* handle U+0080..U+07FF inline */ 4983 b>=0xc2 && 4984 (t1=(uint8_t)(*source-0x80)) <= 0x3f 4985 ) { 4986 c=b&0x1f; 4987 ++source; 4988 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1); 4989 if(value>=minValue) { 4990 *target++=(uint8_t)value; 4991 --targetCapacity; 4992 continue; 4993 } else { 4994 c=(c<<6)|t1; 4995 } 4996 } else { 4997 c=-1; 4998 } 4999 } else if(b==0xe0) { 5000 if( /* handle U+0800..U+0FFF inline */ 5001 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 && 5002 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 5003 ) { 5004 c=t1; 5005 source+=2; 5006 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2); 5007 if(value>=minValue) { 5008 *target++=(uint8_t)value; 5009 --targetCapacity; 5010 continue; 5011 } else { 5012 c=(c<<6)|t2; 5013 } 5014 } else { 5015 c=-1; 5016 } 5017 } else { 5018 c=-1; 5019 } 5020 5021 if(c<0) { 5022 /* handle "complicated" and error cases, and continuing partial characters */ 5023 oldToULength=0; 5024 toULength=1; 5025 toULimit=utf8_countTrailBytes[b]+1; 5026 c=b; 5027moreBytes: 5028 while(toULength<toULimit) { 5029 /* 5030 * The sourceLimit may have been adjusted before the conversion loop 5031 * to stop before a truncated sequence. 
5032 * Here we need to use the real limit in case we have two truncated 5033 * sequences at the end. 5034 * See ticket #7492. 5035 */ 5036 if(source<(uint8_t *)pToUArgs->sourceLimit) { 5037 b=*source; 5038 if(U8_IS_TRAIL(b)) { 5039 ++source; 5040 ++toULength; 5041 c=(c<<6)+b; 5042 } else { 5043 break; /* sequence too short, stop with toULength<toULimit */ 5044 } 5045 } else { 5046 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 5047 source-=(toULength-oldToULength); 5048 while(oldToULength<toULength) { 5049 utf8->toUBytes[oldToULength++]=*source++; 5050 } 5051 utf8->toUnicodeStatus=c; 5052 utf8->toULength=toULength; 5053 utf8->mode=toULimit; 5054 pToUArgs->source=(char *)source; 5055 pFromUArgs->target=(char *)target; 5056 return; 5057 } 5058 } 5059 5060 if( toULength==toULimit && /* consumed all trail bytes */ 5061 (toULength==3 || toULength==2) && /* BMP */ 5062 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 5063 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 5064 ) { 5065 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 5066 } else if( 5067 toULength==toULimit && toULength==4 && 5068 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 5069 ) { 5070 /* supplementary code point */ 5071 if(!hasSupplementary) { 5072 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 5073 value=0; 5074 } else { 5075 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 5076 } 5077 } else { 5078 /* error handling: illegal UTF-8 byte sequence */ 5079 source-=(toULength-oldToULength); 5080 while(oldToULength<toULength) { 5081 utf8->toUBytes[oldToULength++]=*source++; 5082 } 5083 utf8->toULength=toULength; 5084 pToUArgs->source=(char *)source; 5085 pFromUArgs->target=(char *)target; 5086 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 5087 return; 5088 } 5089 } 5090 } 5091 5092 if(value>=minValue) { 5093 /* output the mapping for c */ 5094 *target++=(uint8_t)value; 5095 --targetCapacity; 5096 } else { 5097 /* 
value<minValue means c is unassigned (unmappable) */ 5098 /* 5099 * Try an extension mapping. 5100 * Pass in no source because we don't have UTF-16 input. 5101 * If we have a partial match on c, we will return and revert 5102 * to UTF-8->UTF-16->charset conversion. 5103 */ 5104 static const UChar nul=0; 5105 const UChar *noSource=&nul; 5106 c=_extFromU(cnv, cnv->sharedData, 5107 c, &noSource, noSource, 5108 &target, target+targetCapacity, 5109 NULL, -1, 5110 pFromUArgs->flush, 5111 pErrorCode); 5112 5113 if(U_FAILURE(*pErrorCode)) { 5114 /* not mappable or buffer overflow */ 5115 cnv->fromUChar32=c; 5116 break; 5117 } else if(cnv->preFromUFirstCP>=0) { 5118 /* 5119 * Partial match, return and revert to pivoting. 5120 * In normal from-UTF-16 conversion, we would just continue 5121 * but then exit the loop because the extension match would 5122 * have consumed the source. 5123 */ 5124 break; 5125 } else { 5126 /* a mapping was written to the target, continue */ 5127 5128 /* recalculate the targetCapacity after an extension mapping */ 5129 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 5130 } 5131 } 5132 } else { 5133 /* target is full */ 5134 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 5135 break; 5136 } 5137 } 5138 5139 /* 5140 * The sourceLimit may have been adjusted before the conversion loop 5141 * to stop before a truncated sequence. 5142 * If so, then collect the truncated sequence now. 
5143 */ 5144 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { 5145 c=utf8->toUBytes[0]=b=*source++; 5146 toULength=1; 5147 toULimit=utf8_countTrailBytes[b]+1; 5148 while(source<sourceLimit) { 5149 utf8->toUBytes[toULength++]=b=*source++; 5150 c=(c<<6)+b; 5151 } 5152 utf8->toUnicodeStatus=c; 5153 utf8->toULength=toULength; 5154 utf8->mode=toULimit; 5155 } 5156 5157 /* write back the updated pointers */ 5158 pToUArgs->source=(char *)source; 5159 pFromUArgs->target=(char *)target; 5160} 5161 5162static void 5163ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 5164 UConverterToUnicodeArgs *pToUArgs, 5165 UErrorCode *pErrorCode) { 5166 UConverter *utf8, *cnv; 5167 const uint8_t *source, *sourceLimit; 5168 uint8_t *target; 5169 int32_t targetCapacity; 5170 5171 const uint16_t *table, *mbcsIndex; 5172 const uint16_t *results; 5173 5174 int8_t oldToULength, toULength, toULimit; 5175 5176 UChar32 c; 5177 uint8_t b, t1, t2; 5178 5179 uint32_t stage2Entry; 5180 uint32_t asciiRoundtrips; 5181 uint16_t value, minValue; 5182 UBool hasSupplementary; 5183 5184 /* set up the local pointers */ 5185 utf8=pToUArgs->converter; 5186 cnv=pFromUArgs->converter; 5187 source=(uint8_t *)pToUArgs->source; 5188 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 5189 target=(uint8_t *)pFromUArgs->target; 5190 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 5191 5192 table=cnv->sharedData->mbcs.fromUnicodeTable; 5193 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 5194 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 5195 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 5196 } else { 5197 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 5198 } 5199 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 5200 5201 if(cnv->useFallback) { 5202 /* use all roundtrip and fallback results */ 5203 minValue=0x800; 5204 } else { 5205 /* use only roundtrips and fallbacks from private-use characters */ 5206 minValue=0xc00; 
5207 } 5208 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 5209 5210 /* get the converter state from the UTF-8 UConverter */ 5211 c=(UChar32)utf8->toUnicodeStatus; 5212 if(c!=0) { 5213 toULength=oldToULength=utf8->toULength; 5214 toULimit=(int8_t)utf8->mode; 5215 } else { 5216 toULength=oldToULength=toULimit=0; 5217 } 5218 5219 /* 5220 * Make sure that the last byte sequence before sourceLimit is complete 5221 * or runs into a lead byte. 5222 * Do not go back into the bytes that will be read for finishing a partial 5223 * sequence from the previous buffer. 5224 * In the conversion loop compare source with sourceLimit only once 5225 * per multi-byte character. 5226 */ 5227 { 5228 int32_t i, length; 5229 5230 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 5231 for(i=0; i<3 && i<length;) { 5232 b=*(sourceLimit-i-1); 5233 if(U8_IS_TRAIL(b)) { 5234 ++i; 5235 } else { 5236 if(i<utf8_countTrailBytes[b]) { 5237 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 5238 sourceLimit-=i+1; 5239 } 5240 break; 5241 } 5242 } 5243 } 5244 5245 if(c!=0 && targetCapacity>0) { 5246 utf8->toUnicodeStatus=0; 5247 utf8->toULength=0; 5248 goto moreBytes; 5249 /* See note in ucnv_SBCSFromUTF8() about this goto. 
*/ 5250 } 5251 5252 /* conversion loop */ 5253 while(source<sourceLimit) { 5254 if(targetCapacity>0) { 5255 b=*source++; 5256 if((int8_t)b>=0) { 5257 /* convert ASCII */ 5258 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 5259 *target++=b; 5260 --targetCapacity; 5261 continue; 5262 } else { 5263 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b); 5264 if(value==0) { 5265 c=b; 5266 goto unassigned; 5267 } 5268 } 5269 } else { 5270 if(b>0xe0) { 5271 if( /* handle U+1000..U+D7FF inline */ 5272 (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) || 5273 (b==0xed && (t1 <= 0x1f))) && 5274 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 5275 ) { 5276 c=((b&0xf)<<6)|t1; 5277 source+=2; 5278 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2); 5279 if(value==0) { 5280 c=(c<<6)|t2; 5281 goto unassigned; 5282 } 5283 } else { 5284 c=-1; 5285 } 5286 } else if(b<0xe0) { 5287 if( /* handle U+0080..U+07FF inline */ 5288 b>=0xc2 && 5289 (t1=(uint8_t)(*source-0x80)) <= 0x3f 5290 ) { 5291 c=b&0x1f; 5292 ++source; 5293 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1); 5294 if(value==0) { 5295 c=(c<<6)|t1; 5296 goto unassigned; 5297 } 5298 } else { 5299 c=-1; 5300 } 5301 } else { 5302 c=-1; 5303 } 5304 5305 if(c<0) { 5306 /* handle "complicated" and error cases, and continuing partial characters */ 5307 oldToULength=0; 5308 toULength=1; 5309 toULimit=utf8_countTrailBytes[b]+1; 5310 c=b; 5311moreBytes: 5312 while(toULength<toULimit) { 5313 /* 5314 * The sourceLimit may have been adjusted before the conversion loop 5315 * to stop before a truncated sequence. 5316 * Here we need to use the real limit in case we have two truncated 5317 * sequences at the end. 5318 * See ticket #7492. 
5319 */ 5320 if(source<(uint8_t *)pToUArgs->sourceLimit) { 5321 b=*source; 5322 if(U8_IS_TRAIL(b)) { 5323 ++source; 5324 ++toULength; 5325 c=(c<<6)+b; 5326 } else { 5327 break; /* sequence too short, stop with toULength<toULimit */ 5328 } 5329 } else { 5330 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 5331 source-=(toULength-oldToULength); 5332 while(oldToULength<toULength) { 5333 utf8->toUBytes[oldToULength++]=*source++; 5334 } 5335 utf8->toUnicodeStatus=c; 5336 utf8->toULength=toULength; 5337 utf8->mode=toULimit; 5338 pToUArgs->source=(char *)source; 5339 pFromUArgs->target=(char *)target; 5340 return; 5341 } 5342 } 5343 5344 if( toULength==toULimit && /* consumed all trail bytes */ 5345 (toULength==3 || toULength==2) && /* BMP */ 5346 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 5347 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 5348 ) { 5349 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 5350 } else if( 5351 toULength==toULimit && toULength==4 && 5352 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 5353 ) { 5354 /* supplementary code point */ 5355 if(!hasSupplementary) { 5356 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 5357 stage2Entry=0; 5358 } else { 5359 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 5360 } 5361 } else { 5362 /* error handling: illegal UTF-8 byte sequence */ 5363 source-=(toULength-oldToULength); 5364 while(oldToULength<toULength) { 5365 utf8->toUBytes[oldToULength++]=*source++; 5366 } 5367 utf8->toULength=toULength; 5368 pToUArgs->source=(char *)source; 5369 pFromUArgs->target=(char *)target; 5370 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 5371 return; 5372 } 5373 5374 /* get the bytes and the length for the output */ 5375 /* MBCS_OUTPUT_2 */ 5376 value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c); 5377 5378 /* is this code point assigned, or do we use fallbacks? 
*/ 5379 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 5380 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 5381 ) { 5382 goto unassigned; 5383 } 5384 } 5385 } 5386 5387 /* write the output character bytes from value and length */ 5388 /* from the first if in the loop we know that targetCapacity>0 */ 5389 if(value<=0xff) { 5390 /* this is easy because we know that there is enough space */ 5391 *target++=(uint8_t)value; 5392 --targetCapacity; 5393 } else /* length==2 */ { 5394 *target++=(uint8_t)(value>>8); 5395 if(2<=targetCapacity) { 5396 *target++=(uint8_t)value; 5397 targetCapacity-=2; 5398 } else { 5399 cnv->charErrorBuffer[0]=(char)value; 5400 cnv->charErrorBufferLength=1; 5401 5402 /* target overflow */ 5403 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 5404 break; 5405 } 5406 } 5407 continue; 5408 5409unassigned: 5410 { 5411 /* 5412 * Try an extension mapping. 5413 * Pass in no source because we don't have UTF-16 input. 5414 * If we have a partial match on c, we will return and revert 5415 * to UTF-8->UTF-16->charset conversion. 5416 */ 5417 static const UChar nul=0; 5418 const UChar *noSource=&nul; 5419 c=_extFromU(cnv, cnv->sharedData, 5420 c, &noSource, noSource, 5421 &target, target+targetCapacity, 5422 NULL, -1, 5423 pFromUArgs->flush, 5424 pErrorCode); 5425 5426 if(U_FAILURE(*pErrorCode)) { 5427 /* not mappable or buffer overflow */ 5428 cnv->fromUChar32=c; 5429 break; 5430 } else if(cnv->preFromUFirstCP>=0) { 5431 /* 5432 * Partial match, return and revert to pivoting. 5433 * In normal from-UTF-16 conversion, we would just continue 5434 * but then exit the loop because the extension match would 5435 * have consumed the source. 
5436 */ 5437 break; 5438 } else { 5439 /* a mapping was written to the target, continue */ 5440 5441 /* recalculate the targetCapacity after an extension mapping */ 5442 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 5443 continue; 5444 } 5445 } 5446 } else { 5447 /* target is full */ 5448 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 5449 break; 5450 } 5451 } 5452 5453 /* 5454 * The sourceLimit may have been adjusted before the conversion loop 5455 * to stop before a truncated sequence. 5456 * If so, then collect the truncated sequence now. 5457 */ 5458 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { 5459 c=utf8->toUBytes[0]=b=*source++; 5460 toULength=1; 5461 toULimit=utf8_countTrailBytes[b]+1; 5462 while(source<sourceLimit) { 5463 utf8->toUBytes[toULength++]=b=*source++; 5464 c=(c<<6)+b; 5465 } 5466 utf8->toUnicodeStatus=c; 5467 utf8->toULength=toULength; 5468 utf8->mode=toULimit; 5469 } 5470 5471 /* write back the updated pointers */ 5472 pToUArgs->source=(char *)source; 5473 pFromUArgs->target=(char *)target; 5474} 5475 5476/* miscellaneous ------------------------------------------------------------ */ 5477 5478static void 5479ucnv_MBCSGetStarters(const UConverter* cnv, 5480 UBool starters[256], 5481 UErrorCode *pErrorCode) { 5482 const int32_t *state0; 5483 int i; 5484 5485 state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState]; 5486 for(i=0; i<256; ++i) { 5487 /* all bytes that cause a state transition from state 0 are lead bytes */ 5488 starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]); 5489 } 5490} 5491 5492/* 5493 * This is an internal function that allows other converter implementations 5494 * to check whether a byte is a lead byte. 
5495 */ 5496U_CFUNC UBool 5497ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { 5498 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]); 5499} 5500 5501static void 5502ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 5503 int32_t offsetIndex, 5504 UErrorCode *pErrorCode) { 5505 UConverter *cnv=pArgs->converter; 5506 char *p, *subchar; 5507 char buffer[4]; 5508 int32_t length; 5509 5510 /* first, select between subChar and subChar1 */ 5511 if( cnv->subChar1!=0 && 5512 (cnv->sharedData->mbcs.extIndexes!=NULL ? 5513 cnv->useSubChar1 : 5514 (cnv->invalidUCharBuffer[0]<=0xff)) 5515 ) { 5516 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ 5517 subchar=(char *)&cnv->subChar1; 5518 length=1; 5519 } else { 5520 /* select subChar in all other cases */ 5521 subchar=(char *)cnv->subChars; 5522 length=cnv->subCharLen; 5523 } 5524 5525 /* reset the selector for the next code point */ 5526 cnv->useSubChar1=FALSE; 5527 5528 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) { 5529 p=buffer; 5530 5531 /* fromUnicodeStatus contains prevLength */ 5532 switch(length) { 5533 case 1: 5534 if(cnv->fromUnicodeStatus==2) { 5535 /* DBCS mode and SBCS sub char: change to SBCS */ 5536 cnv->fromUnicodeStatus=1; 5537 *p++=UCNV_SI; 5538 } 5539 *p++=subchar[0]; 5540 break; 5541 case 2: 5542 if(cnv->fromUnicodeStatus<=1) { 5543 /* SBCS mode and DBCS sub char: change to DBCS */ 5544 cnv->fromUnicodeStatus=2; 5545 *p++=UCNV_SO; 5546 } 5547 *p++=subchar[0]; 5548 *p++=subchar[1]; 5549 break; 5550 default: 5551 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 5552 return; 5553 } 5554 subchar=buffer; 5555 length=(int32_t)(p-buffer); 5556 } 5557 5558 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode); 5559} 5560 5561U_CFUNC UConverterType 5562ucnv_MBCSGetType(const UConverter* converter) { 5563 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we 
cheat a little */ 5564 if(converter->sharedData->mbcs.countStates==1) { 5565 return (UConverterType)UCNV_SBCS; 5566 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) { 5567 return (UConverterType)UCNV_EBCDIC_STATEFUL; 5568 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) { 5569 return (UConverterType)UCNV_DBCS; 5570 } 5571 return (UConverterType)UCNV_MBCS; 5572} 5573 5574static const UConverterImpl _SBCSUTF8Impl={ 5575 UCNV_MBCS, 5576 5577 ucnv_MBCSLoad, 5578 ucnv_MBCSUnload, 5579 5580 ucnv_MBCSOpen, 5581 NULL, 5582 NULL, 5583 5584 ucnv_MBCSToUnicodeWithOffsets, 5585 ucnv_MBCSToUnicodeWithOffsets, 5586 ucnv_MBCSFromUnicodeWithOffsets, 5587 ucnv_MBCSFromUnicodeWithOffsets, 5588 ucnv_MBCSGetNextUChar, 5589 5590 ucnv_MBCSGetStarters, 5591 ucnv_MBCSGetName, 5592 ucnv_MBCSWriteSub, 5593 NULL, 5594 ucnv_MBCSGetUnicodeSet, 5595 5596 NULL, 5597 ucnv_SBCSFromUTF8 5598}; 5599 5600static const UConverterImpl _DBCSUTF8Impl={ 5601 UCNV_MBCS, 5602 5603 ucnv_MBCSLoad, 5604 ucnv_MBCSUnload, 5605 5606 ucnv_MBCSOpen, 5607 NULL, 5608 NULL, 5609 5610 ucnv_MBCSToUnicodeWithOffsets, 5611 ucnv_MBCSToUnicodeWithOffsets, 5612 ucnv_MBCSFromUnicodeWithOffsets, 5613 ucnv_MBCSFromUnicodeWithOffsets, 5614 ucnv_MBCSGetNextUChar, 5615 5616 ucnv_MBCSGetStarters, 5617 ucnv_MBCSGetName, 5618 ucnv_MBCSWriteSub, 5619 NULL, 5620 ucnv_MBCSGetUnicodeSet, 5621 5622 NULL, 5623 ucnv_DBCSFromUTF8 5624}; 5625 5626static const UConverterImpl _MBCSImpl={ 5627 UCNV_MBCS, 5628 5629 ucnv_MBCSLoad, 5630 ucnv_MBCSUnload, 5631 5632 ucnv_MBCSOpen, 5633 NULL, 5634 NULL, 5635 5636 ucnv_MBCSToUnicodeWithOffsets, 5637 ucnv_MBCSToUnicodeWithOffsets, 5638 ucnv_MBCSFromUnicodeWithOffsets, 5639 ucnv_MBCSFromUnicodeWithOffsets, 5640 ucnv_MBCSGetNextUChar, 5641 5642 ucnv_MBCSGetStarters, 5643 ucnv_MBCSGetName, 5644 ucnv_MBCSWriteSub, 5645 NULL, 5646 ucnv_MBCSGetUnicodeSet 5647}; 5648 5649 5650/* Static data is in 
tools/makeconv/ucnvstat.c for data-based 5651 * converters. Be sure to update it as well. 5652 */ 5653 5654const UConverterSharedData _MBCSData={ 5655 sizeof(UConverterSharedData), 1, 5656 NULL, NULL, NULL, FALSE, &_MBCSImpl, 5657 0 5658}; 5659 5660#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 5661