ucnvmbcs.c revision b0ac937921a2c196d8b9da665135bf6ba01a1ccf
1/*
2******************************************************************************
3*
4*   Copyright (C) 2000-2009, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  ucnvmbcs.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2000jul03
14*   created by: Markus W. Scherer
15*
16*   The current code in this file replaces the previous implementation
17*   of conversion code from multi-byte codepages to Unicode and back.
18*   This implementation supports the following:
19*   - legacy variable-length codepages with up to 4 bytes per character
20*   - all Unicode code points (up to 0x10ffff)
21*   - efficient distinction of unassigned vs. illegal byte sequences
22*   - it is possible in fromUnicode() to directly deal with simple
23*     stateful encodings (used for EBCDIC_STATEFUL)
24*   - it is possible to convert Unicode code points
25*     to a single zero byte (but not as a fallback except for SBCS)
26*
27*   Remaining limitations in fromUnicode:
28*   - byte sequences must not have leading zero bytes
29*   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
30*   - limitation to up to 4 bytes per character
31*
32*   ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
33*   limitations and adds m:n character mappings and other features.
34*   See ucnv_ext.h for details.
35*
36*   Change history:
37*
38*    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
39*                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
40*                             macros to ucnvmbcs.h file
41*/
42
43#include "unicode/utypes.h"
44
45#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
46
47#include "unicode/ucnv.h"
48#include "unicode/ucnv_cb.h"
49#include "unicode/udata.h"
50#include "unicode/uset.h"
51#include "ucnv_bld.h"
52#include "ucnvmbcs.h"
53#include "ucnv_ext.h"
54#include "ucnv_cnv.h"
55#include "umutex.h"
56#include "cmemory.h"
57#include "cstring.h"
58
59/* control optimizations according to the platform */
60#define MBCS_UNROLL_SINGLE_TO_BMP 1
61#define MBCS_UNROLL_SINGLE_FROM_BMP 0
62
63/*
64 * _MBCSHeader versions 5.3 & 4.3
65 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
66 *
67 * This version is optional. Version 5 is used for incompatible data format changes.
68 * makeconv will continue to generate version 4 files if possible.
69 *
70 * Changes from version 4:
71 *
72 * The main difference is an additional _MBCSHeader field with
73 * - the length (number of uint32_t) of the _MBCSHeader
74 * - flags for further incompatible data format changes
75 * - flags for further, backward compatible data format changes
76 *
 * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
78 * the file and needs to be reconstituted at load time.
79 * This requires a utf8Friendly format with an additional mbcsIndex table for fast
80 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
81 * (For details about these structures see below, and see ucnvmbcs.h.)
82 *
83 *   utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
84 *   of the Unicode code points. (This requires that the .ucm file has the |0 etc.
85 *   precision markers for all mappings.)
86 *
87 *   All fallbacks have been moved to the extension table, leaving only roundtrips in the
88 *   omitted data that can be reconstituted from the toUnicode data.
89 *
90 *   Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
91 *   With only roundtrip mappings in the base fromUnicode data, this part is fully
92 *   redundant with the mbcsIndex and will be reconstituted from that (also using the
93 *   stage 1 table which contains the information about how stage 2 was compacted).
94 *
95 *   The rest of the stage 2 table, the part for code points above maxFastUChar,
96 *   is stored in the file and will be appended to the reconstituted part.
97 *
 *   The entire fromUBytes array is omitted from the file and will be reconstituted.
99 *   This is done by enumerating all toUnicode roundtrip mappings, performing
100 *   each mapping (using the stage 1 and reconstituted stage 2 tables) and
101 *   writing instead of reading the byte values.
102 *
103 * _MBCSHeader version 4.3
104 *
105 * Change from version 4.2:
106 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
107 *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
108 *   files which can be used instead of stages 1 & 2.
109 *   Faster lookups for roundtrips from most commonly used characters,
110 *   and lookups from UTF-8 byte sequences with a natural bit distribution.
111 *   See ucnvmbcs.h for more details.
112 *
113 * Change from version 4.1:
114 * - Added an optional extension table structure at the end of the .cnv file.
115 *   It is present if the upper bits of the header flags field contains a non-zero
116 *   byte offset to it.
 *   Files that contain only an extension table and no base table
118 *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
119 *   These contain the base table name between the MBCS header and the extension
120 *   data.
121 *
122 * Change from version 4.0:
123 * - Replace header.reserved with header.fromUBytesLength so that all
124 *   fields in the data have length.
125 *
126 * Changes from version 3 (for performance improvements):
127 * - new bit distribution for state table entries
128 * - reordered action codes
129 * - new data structure for single-byte fromUnicode
130 *   + stage 2 only contains indexes
131 *   + stage 3 stores 16 bits per character with classification bits 15..8
132 * - no multiplier for stage 1 entries
133 * - stage 2 for non-single-byte codepages contains the index and the flags in
134 *   one 32-bit value
135 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
136 *
137 * For more details about old versions of the MBCS data structure, see
138 * the corresponding versions of this file.
139 *
140 * Converting stateless codepage data ---------------------------------------***
141 * (or codepage data with simple states) to Unicode.
142 *
143 * Data structure and algorithm for converting from complex legacy codepages
144 * to Unicode. (Designed before 2000-may-22.)
145 *
146 * The basic idea is that the structure of legacy codepages can be described
147 * with state tables.
148 * When reading a byte stream, each input byte causes a state transition.
149 * Some transitions result in the output of a code point, some result in
150 * "unassigned" or "illegal" output.
151 * This is used here for character conversion.
152 *
153 * The data structure begins with a state table consisting of a row
154 * per state, with 256 entries (columns) per row for each possible input
155 * byte value.
156 * Each entry is 32 bits wide, with two formats distinguished by
157 * the sign bit (bit 31):
158 *
159 * One format for transitional entries (bit 31 not set) for non-final bytes, and
160 * one format for final entries (bit 31 set).
161 * Both formats contain the number of the next state in the same bit
162 * positions.
163 * State 0 is the initial state.
164 *
165 * Most of the time, the offset values of subsequent states are added
166 * up to a scalar value. This value will eventually be the index of
167 * the Unicode code point in a table that follows the state table.
168 * The effect is that the code points for final state table rows
169 * are contiguous. The code points of final state rows follow each other
170 * in the order of the references to those final states by previous
171 * states, etc.
172 *
 * For some terminal states, the offset is itself the output Unicode
 * code point (16 bits for a BMP code point or 20 bits for a supplementary
 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
176 * For others, the code point in the Unicode table is stored with either
177 * one or two code units: one for BMP code points, two for a pair of
178 * surrogates.
179 * All code points for a final state entry take up the same number of code
180 * units, regardless of whether they all actually _use_ the same number
181 * of code units. This is necessary for simple array access.
182 *
183 * An additional feature comes in with what in ICU is called "fallback"
184 * mappings:
185 *
186 * In addition to round-trippable, precise, 1:1 mappings, there are often
187 * mappings defined between similar, though not the same, characters.
188 * Typically, such mappings occur only in fromUnicode mapping tables because
189 * Unicode has a superset repertoire of most other codepages. However, it
190 * is possible to provide such mappings in the toUnicode tables, too.
191 * In this case, the fallback mappings are partly integrated into the
192 * general state tables because the structure of the encoding includes their
193 * byte sequences.
194 * For final entries in an initial state, fallback mappings are stored in
195 * the entry itself like with roundtrip mappings.
196 * For other final entries, they are stored in the code units table if
197 * the entry is for a pair of code units.
198 * For single-unit results in the code units table, there is no space to
199 * alternatively hold a fallback mapping; in this case, the code unit
200 * is stored as U+fffe (unassigned), and the fallback mapping needs to
201 * be looked up by the scalar offset value in a separate table.
202 *
203 * "Unassigned" state entries really mean "structurally unassigned",
204 * i.e., such a byte sequence will never have a mapping result.
205 *
206 * The interpretation of the bits in each entry is as follows:
207 *
208 * Bit 31 not set, not a terminal entry ("transitional"):
209 * 30..24 next state
210 * 23..0  offset delta, to be added up
211 *
212 * Bit 31 set, terminal ("final") entry:
213 * 30..24 next state (regardless of action code)
214 * 23..20 action code:
215 *        action codes 0 and 1 result in precise-mapping Unicode code points
216 *        0  valid byte sequence
217 *           19..16 not used, 0
218 *           15..0  16-bit Unicode BMP code point
219 *                  never U+fffe or U+ffff
220 *        1  valid byte sequence
221 *           19..0  20-bit Unicode supplementary code point
222 *                  never U+fffe or U+ffff
223 *
224 *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
225 *        2  valid byte sequence (fallback)
226 *           19..16 not used, 0
227 *           15..0  16-bit Unicode BMP code point as fallback result
228 *        3  valid byte sequence (fallback)
229 *           19..0  20-bit Unicode supplementary code point as fallback result
230 *
231 *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
232 *        depending on the code units they result in
233 *        4  valid byte sequence
234 *           19..9  not used, 0
235 *            8..0  final offset delta
236 *                  pointing to one 16-bit code unit which may be
237 *                  fffe  unassigned -- look for a fallback for this offset
238 *                  ffff  illegal
239 *        5  valid byte sequence
240 *           19..9  not used, 0
241 *            8..0  final offset delta
242 *                  pointing to two 16-bit code units
243 *                  (typically UTF-16 surrogates)
244 *                  the result depends on the first code unit as follows:
245 *                  0000..d7ff  roundtrip BMP code point (1st alone)
246 *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
247 *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
248 *                  e000        roundtrip BMP code point (2nd alone)
249 *                  e001        fallback BMP code point (2nd alone)
250 *                  fffe        unassigned
251 *                  ffff        illegal
252 *           (the final offset deltas are at most 255 * 2,
253 *            times 2 because of storing code unit pairs)
254 *
255 *        6  unassigned byte sequence
256 *           19..16 not used, 0
257 *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
258 *                  this does not contain a final offset delta because the main
259 *                  purpose of this action code is to save scalar offset values;
260 *                  therefore, fallback values cannot be assigned to byte
261 *                  sequences that result in this action code
262 *        7  illegal byte sequence
263 *           19..16 not used, 0
264 *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
265 *        8  state change only
266 *           19..0  not used, 0
267 *           useful for state changes in simple stateful encodings,
268 *           at Shift-In/Shift-Out codes
269 *
270 *
271 *        9..15 reserved for future use
272 *           current implementations will only perform a state change
273 *           and ignore bits 19..0
274 *
275 * An encoding with contiguous ranges of unassigned byte sequences, like
276 * Shift-JIS and especially EUC-TW, can be stored efficiently by having
277 * at least two states for the trail bytes:
278 * One trail byte state that results in code points, and one that only
279 * has "unassigned" and "illegal" terminal states.
280 *
281 * Note: partly by accident, this data structure supports simple stateful
282 * encodings without any additional logic.
283 * Currently, only simple Shift-In/Shift-Out schemes are handled with
284 * appropriate state tables (especially EBCDIC_STATEFUL!).
285 *
286 * MBCS version 2 added:
287 * unassigned and illegal action codes have U+fffe and U+ffff
288 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
289 *
290 * Converting from Unicode to codepage bytes --------------------------------***
291 *
292 * The conversion data structure for fromUnicode is designed for the known
293 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
294 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
295 * a roundtrip mapping.
296 *
297 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
298 * like in the character properties table.
299 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
300 * with the resulting bytes is at offsetFromUBytes.
301 *
302 * Beginning with version 4, single-byte codepages have a significantly different
303 * trie compared to other codepages.
304 * In all cases, the entry in stage 1 is directly the index of the block of
305 * 64 entries in stage 2.
306 *
307 * Single-byte lookup:
308 *
309 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
310 * Stage 3 contains one 16-bit word per result:
311 * Bits 15..8 indicate the kind of result:
312 *    f  roundtrip result
313 *    c  fallback result from private-use code point
314 *    8  fallback result from other code points
315 *    0  unassigned
316 * Bits 7..0 contain the codepage byte. A zero byte is always possible.
317 *
318 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
319 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
320 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
321 * ASCII code points can be looked up with a linear array access into stage 3.
322 * See maxFastUChar and other details in ucnvmbcs.h.
323 *
324 * Multi-byte lookup:
325 *
326 * Stage 2 contains a 32-bit word for each 16-block in stage 3:
327 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
328 *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
329 *             If this test is false, then a non-zero result will be interpreted as
330 *             a fallback mapping.
331 * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
332 *
333 * Stage 3 contains 2, 3, or 4 bytes per result.
334 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
335 * while 3 bytes are stored as bytes in big-endian order.
336 * Leading zero bytes are ignored, and the number of bytes is counted.
337 * A zero byte mapping result is possible as a roundtrip result.
338 * For some output types, the actual result is processed from this;
339 * see ucnv_MBCSFromUnicodeWithOffsets().
340 *
341 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
342 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
343 *
344 * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
345 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
346 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
347 * ASCII code points can be looked up with a linear array access into stage 3.
348 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
349 *
350 * In version 3, stage 2 blocks may overlap by multiples of the multiplier
351 * for compaction.
352 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
353 * may overlap by any number of entries.
354 *
355 * MBCS version 2 added:
356 * the converter checks for known output types, which allows
357 * adding new ones without crashing an unaware converter
358 */
359
360static const UConverterImpl _SBCSUTF8Impl;
361static const UConverterImpl _DBCSUTF8Impl;
362
363/* GB 18030 data ------------------------------------------------------------ */
364
/*
 * Helper macros for linear values for GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) combines the four bytes of a sequence into one
 * integer using the radixes 10, 126 and 10 for the 2nd, 3rd and 4th byte,
 * so that consecutive four-byte sequences get consecutive linear values.
 * The byte values are used as-is (no per-byte base is subtracted), which is
 * why differences between two linear values are meaningful while the
 * absolute value is not; LINEAR_18030_BASE is the linear value of the
 * sequence 81 30 81 30.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

/*
 * LINEAR(x) takes the four bytes packed big-endian into one 32-bit value.
 * The argument is fully parenthesized so that expression arguments
 * (for example a|b) are grouped correctly; note that x is evaluated
 * four times, so it must not have side effects.
 */
#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)
371
372/*
373 * Some ranges of GB 18030 where both the Unicode code points and the
374 * GB four-byte sequences are contiguous and are handled algorithmically by
375 * the special callback functions below.
376 * The values are start & end of Unicode & GB codes.
377 *
378 * Note that single surrogates are not mapped by GB 18030
379 * as of the re-released mapping tables from 2000-nov-30.
380 */
381static const uint32_t
382gb18030Ranges[13][4]={
383    {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
384    {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
385    {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
386    {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
387    {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
388    {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
389    {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
390    {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
391    {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
392    {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
393    {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
394    {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
395    {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
396};
397
/*
 * Bit in UConverter.options which marks a converter as needing the
 * special GB 18030 handling.
 */
#define _MBCS_OPTION_GB18030 0x8000
400
401/* Miscellaneous ------------------------------------------------------------ */
402
403/**
404 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
405 * consecutive sequences of bytes, starting from the one encoded in value,
406 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
407 * Does not currently support m:n mappings or reverse fallbacks.
408 * This function will not be called for sequences of bytes with leading zeros.
409 *
410 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
411 * @param value contains 1..4 bytes of the first byte sequence, right-aligned
412 * @param codePoints resulting Unicode code points, or negative if a byte sequence does
413 *        not map to anything
414 * @return TRUE to continue enumeration, FALSE to stop
415 */
416typedef UBool U_CALLCONV
417UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
418
419/* similar to ucnv_MBCSGetNextUChar() but recursive */
420static UBool
421enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
422        int32_t state, uint32_t offset,
423        uint32_t value,
424        UConverterEnumToUCallback *callback, const void *context,
425        UErrorCode *pErrorCode) {
426    UChar32 codePoints[32];
427    const int32_t *row;
428    const uint16_t *unicodeCodeUnits;
429    UChar32 anyCodePoints;
430    int32_t b, limit;
431
432    row=mbcsTable->stateTable[state];
433    unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
434
435    value<<=8;
436    anyCodePoints=-1;  /* becomes non-negative if there is a mapping */
437
438    b=(stateProps[state]&0x38)<<2;
439    if(b==0 && stateProps[state]>=0x40) {
440        /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
441        codePoints[0]=U_SENTINEL;
442        b=1;
443    }
444    limit=((stateProps[state]&7)+1)<<5;
445    while(b<limit) {
446        int32_t entry=row[b];
447        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
448            int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
449            if(stateProps[nextState]>=0) {
450                /* recurse to a state with non-ignorable actions */
451                if(!enumToU(
452                        mbcsTable, stateProps, nextState,
453                        offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
454                        value|(uint32_t)b,
455                        callback, context,
456                        pErrorCode)) {
457                    return FALSE;
458                }
459            }
460            codePoints[b&0x1f]=U_SENTINEL;
461        } else {
462            UChar32 c;
463            int32_t action;
464
465            /*
466             * An if-else-if chain provides more reliable performance for
467             * the most common cases compared to a switch.
468             */
469            action=MBCS_ENTRY_FINAL_ACTION(entry);
470            if(action==MBCS_STATE_VALID_DIRECT_16) {
471                /* output BMP code point */
472                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
473            } else if(action==MBCS_STATE_VALID_16) {
474                int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
475                c=unicodeCodeUnits[finalOffset];
476                if(c<0xfffe) {
477                    /* output BMP code point */
478                } else {
479                    c=U_SENTINEL;
480                }
481            } else if(action==MBCS_STATE_VALID_16_PAIR) {
482                int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
483                c=unicodeCodeUnits[finalOffset++];
484                if(c<0xd800) {
485                    /* output BMP code point below 0xd800 */
486                } else if(c<=0xdbff) {
487                    /* output roundtrip or fallback supplementary code point */
488                    c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
489                } else if(c==0xe000) {
490                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
491                    c=unicodeCodeUnits[finalOffset];
492                } else {
493                    c=U_SENTINEL;
494                }
495            } else if(action==MBCS_STATE_VALID_DIRECT_20) {
496                /* output supplementary code point */
497                c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
498            } else {
499                c=U_SENTINEL;
500            }
501
502            codePoints[b&0x1f]=c;
503            anyCodePoints&=c;
504        }
505        if(((++b)&0x1f)==0) {
506            if(anyCodePoints>=0) {
507                if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
508                    return FALSE;
509                }
510                anyCodePoints=-1;
511            }
512        }
513    }
514    return TRUE;
515}
516
517/*
518 * Only called if stateProps[state]==-1.
519 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
520 * MBCS_STATE_CHANGE_ONLY.
521 */
522static int8_t
523getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
524    const int32_t *row;
525    int32_t min, max, entry, nextState;
526
527    row=stateTable[state];
528    stateProps[state]=0;
529
530    /* find first non-ignorable state */
531    for(min=0;; ++min) {
532        entry=row[min];
533        nextState=MBCS_ENTRY_STATE(entry);
534        if(stateProps[nextState]==-1) {
535            getStateProp(stateTable, stateProps, nextState);
536        }
537        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
538            if(stateProps[nextState]>=0) {
539                break;
540            }
541        } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
542            break;
543        }
544        if(min==0xff) {
545            stateProps[state]=-0x40;  /* (int8_t)0xc0 */
546            return stateProps[state];
547        }
548    }
549    stateProps[state]|=(int8_t)((min>>5)<<3);
550
551    /* find last non-ignorable state */
552    for(max=0xff; min<max; --max) {
553        entry=row[max];
554        nextState=MBCS_ENTRY_STATE(entry);
555        if(stateProps[nextState]==-1) {
556            getStateProp(stateTable, stateProps, nextState);
557        }
558        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
559            if(stateProps[nextState]>=0) {
560                break;
561            }
562        } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
563            break;
564        }
565    }
566    stateProps[state]|=(int8_t)(max>>5);
567
568    /* recurse further and collect direct-state information */
569    while(min<=max) {
570        entry=row[min];
571        nextState=MBCS_ENTRY_STATE(entry);
572        if(stateProps[nextState]==-1) {
573            getStateProp(stateTable, stateProps, nextState);
574        }
575        if(MBCS_ENTRY_IS_FINAL(entry)) {
576            stateProps[nextState]|=0x40;
577            if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
578                stateProps[state]|=0x40;
579            }
580        }
581        ++min;
582    }
583    return stateProps[state];
584}
585
586/*
587 * Internal function enumerating the toUnicode data of an MBCS converter.
588 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
589 * table, but could also be used for a future ucnv_getUnicodeSet() option
590 * that includes reverse fallbacks (after updating this function's implementation).
591 * Currently only handles roundtrip mappings.
592 * Does not currently handle extensions.
593 */
594static void
595ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
596                       UConverterEnumToUCallback *callback, const void *context,
597                       UErrorCode *pErrorCode) {
598    /*
599     * Properties for each state, to speed up the enumeration.
600     * Ignorable actions are unassigned/illegal/state-change-only:
601     * They do not lead to mappings.
602     *
603     * Bits 7..6:
604     * 1 direct/initial state (stateful converters have multiple)
605     * 0 non-initial state with transitions or with non-ignorable result actions
606     * -1 final state with only ignorable actions
607     *
608     * Bits 5..3:
609     * The lowest byte value with non-ignorable actions is
610     * value<<5 (rounded down).
611     *
612     * Bits 2..0:
613     * The highest byte value with non-ignorable actions is
614     * (value<<5)&0x1f (rounded up).
615     */
616    int8_t stateProps[MBCS_MAX_STATE_COUNT];
617    int32_t state;
618
619    uprv_memset(stateProps, -1, sizeof(stateProps));
620
621    /* recurse from state 0 and set all stateProps */
622    getStateProp(mbcsTable->stateTable, stateProps, 0);
623
624    for(state=0; state<mbcsTable->countStates; ++state) {
625        /*if(stateProps[state]==-1) {
626            printf("unused/unreachable <icu:state> %d\n", state);
627        }*/
628        if(stateProps[state]>=0x40) {
629            /* start from each direct state */
630            enumToU(
631                mbcsTable, stateProps, state, 0, 0,
632                callback, context,
633                pErrorCode);
634        }
635    }
636}
637
638U_CFUNC void
639ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
640                                         const USetAdder *sa,
641                                         UConverterUnicodeSet which,
642                                         UConverterSetFilter filter,
643                                         UErrorCode *pErrorCode) {
644    const UConverterMBCSTable *mbcsTable;
645    const uint16_t *table;
646
647    uint32_t st3;
648    uint16_t st1, maxStage1, st2;
649
650    UChar32 c;
651
652    /* enumerate the from-Unicode trie table */
653    mbcsTable=&sharedData->mbcs;
654    table=mbcsTable->fromUnicodeTable;
655    if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
656        maxStage1=0x440;
657    } else {
658        maxStage1=0x40;
659    }
660
661    c=0; /* keep track of the current code point while enumerating */
662
663    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
664        const uint16_t *stage2, *stage3, *results;
665        uint16_t minValue;
666
667        results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
668
669        /*
670         * Set a threshold variable for selecting which mappings to use.
671         * See ucnv_MBCSSingleFromBMPWithOffsets() and
672         * MBCS_SINGLE_RESULT_FROM_U() for details.
673         */
674        if(which==UCNV_ROUNDTRIP_SET) {
675            /* use only roundtrips */
676            minValue=0xf00;
677        } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
678            /* use all roundtrip and fallback results */
679            minValue=0x800;
680        }
681
682        for(st1=0; st1<maxStage1; ++st1) {
683            st2=table[st1];
684            if(st2>maxStage1) {
685                stage2=table+st2;
686                for(st2=0; st2<64; ++st2) {
687                    if((st3=stage2[st2])!=0) {
688                        /* read the stage 3 block */
689                        stage3=results+st3;
690
691                        do {
692                            if(*stage3++>=minValue) {
693                                sa->add(sa->set, c);
694                            }
695                        } while((++c&0xf)!=0);
696                    } else {
697                        c+=16; /* empty stage 3 block */
698                    }
699                }
700            } else {
701                c+=1024; /* empty stage 2 block */
702            }
703        }
704    } else {
705        const uint32_t *stage2;
706        const uint8_t *stage3, *bytes;
707        uint32_t st3Multiplier;
708        uint32_t value;
709        UBool useFallback;
710
711        bytes=mbcsTable->fromUnicodeBytes;
712
713        useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
714
715        switch(mbcsTable->outputType) {
716        case MBCS_OUTPUT_3:
717        case MBCS_OUTPUT_4_EUC:
718            st3Multiplier=3;
719            break;
720        case MBCS_OUTPUT_4:
721            st3Multiplier=4;
722            break;
723        default:
724            st3Multiplier=2;
725            break;
726        }
727
728        for(st1=0; st1<maxStage1; ++st1) {
729            st2=table[st1];
730            if(st2>(maxStage1>>1)) {
731                stage2=(const uint32_t *)table+st2;
732                for(st2=0; st2<64; ++st2) {
733                    if((st3=stage2[st2])!=0) {
734                        /* read the stage 3 block */
735                        stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
736
737                        /* get the roundtrip flags for the stage 3 block */
738                        st3>>=16;
739
740                        /*
741                         * Add code points for which the roundtrip flag is set,
742                         * or which map to non-zero bytes if we use fallbacks.
743                         * See ucnv_MBCSFromUnicodeWithOffsets() for details.
744                         */
745                        switch(filter) {
746                        case UCNV_SET_FILTER_NONE:
747                            do {
748                                if(st3&1) {
749                                    sa->add(sa->set, c);
750                                    stage3+=st3Multiplier;
751                                } else if(useFallback) {
752                                    uint8_t b=0;
753                                    switch(st3Multiplier) {
754                                    case 4:
755                                        b|=*stage3++;
756                                    case 3:
757                                        b|=*stage3++;
758                                    case 2:
759                                        b|=stage3[0]|stage3[1];
760                                        stage3+=2;
761                                    default:
762                                        break;
763                                    }
764                                    if(b!=0) {
765                                        sa->add(sa->set, c);
766                                    }
767                                }
768                                st3>>=1;
769                            } while((++c&0xf)!=0);
770                            break;
771                        case UCNV_SET_FILTER_DBCS_ONLY:
772                             /* Ignore single-byte results (<0x100). */
773                            do {
774                                if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
775                                    sa->add(sa->set, c);
776                                }
777                                st3>>=1;
778                                stage3+=2;  /* +=st3Multiplier */
779                            } while((++c&0xf)!=0);
780                            break;
781                        case UCNV_SET_FILTER_2022_CN:
782                             /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
783                            do {
784                                if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
785                                    sa->add(sa->set, c);
786                                }
787                                st3>>=1;
788                                stage3+=3;  /* +=st3Multiplier */
789                            } while((++c&0xf)!=0);
790                            break;
791                        case UCNV_SET_FILTER_SJIS:
792                             /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
793                            do {
794                                if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
795                                    sa->add(sa->set, c);
796                                }
797                                st3>>=1;
798                                stage3+=2;  /* +=st3Multiplier */
799                            } while((++c&0xf)!=0);
800                            break;
801                        case UCNV_SET_FILTER_GR94DBCS:
802                            /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
803                            do {
804                                if( ((st3&1)!=0 || useFallback) &&
805                                    (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
806                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
807                                ) {
808                                    sa->add(sa->set, c);
809                                }
810                                st3>>=1;
811                                stage3+=2;  /* +=st3Multiplier */
812                            } while((++c&0xf)!=0);
813                            break;
814                        case UCNV_SET_FILTER_HZ:
815                            /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
816                            do {
817                                if( ((st3&1)!=0 || useFallback) &&
818                                    (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
819                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
820                                ) {
821                                    sa->add(sa->set, c);
822                                }
823                                st3>>=1;
824                                stage3+=2;  /* +=st3Multiplier */
825                            } while((++c&0xf)!=0);
826                            break;
827                        default:
828                            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
829                            return;
830                        }
831                    } else {
832                        c+=16; /* empty stage 3 block */
833                    }
834                }
835            } else {
836                c+=1024; /* empty stage 2 block */
837            }
838        }
839    }
840
841    ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
842}
843
844U_CFUNC void
845ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
846                                 const USetAdder *sa,
847                                 UConverterUnicodeSet which,
848                                 UErrorCode *pErrorCode) {
849    ucnv_MBCSGetFilteredUnicodeSetForUnicode(
850        sharedData, sa, which,
851        sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
852            UCNV_SET_FILTER_DBCS_ONLY :
853            UCNV_SET_FILTER_NONE,
854        pErrorCode);
855}
856
857static void
858ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
859                   const USetAdder *sa,
860                   UConverterUnicodeSet which,
861                   UErrorCode *pErrorCode) {
862    if(cnv->options&_MBCS_OPTION_GB18030) {
863        sa->addRange(sa->set, 0, 0xd7ff);
864        sa->addRange(sa->set, 0xe000, 0x10ffff);
865    } else {
866        ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
867    }
868}
869
870/* conversion extensions for input not in the main table -------------------- */
871
/*
 * Hardcoded extension handling for GB 18030.
 * For the definitions of the LINEAR macros and gb18030Ranges, see near the beginning of the file.
 *
 * In the future, conversion extensions may handle m:n mappings and delta tables,
 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
 *
 * If an input character cannot be mapped, then these functions set an error
 * code. The framework will then call the callback function.
 */
882
883/*
884 * @return if(U_FAILURE) return the code point for cnv->fromUChar32
885 *         else return 0 after output has been written to the target
886 */
887static UChar32
888_extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
889          UChar32 cp,
890          const UChar **source, const UChar *sourceLimit,
891          uint8_t **target, const uint8_t *targetLimit,
892          int32_t **offsets, int32_t sourceIndex,
893          UBool flush,
894          UErrorCode *pErrorCode) {
895    const int32_t *cx;
896
897    cnv->useSubChar1=FALSE;
898
899    if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
900        ucnv_extInitialMatchFromU(
901            cnv, cx,
902            cp, source, sourceLimit,
903            (char **)target, (char *)targetLimit,
904            offsets, sourceIndex,
905            flush,
906            pErrorCode)
907    ) {
908        return 0; /* an extension mapping handled the input */
909    }
910
911    /* GB 18030 */
912    if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
913        const uint32_t *range;
914        int32_t i;
915
916        range=gb18030Ranges[0];
917        for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
918            if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
919                /* found the Unicode code point, output the four-byte sequence for it */
920                uint32_t linear;
921                char bytes[4];
922
923                /* get the linear value of the first GB 18030 code in this range */
924                linear=range[2]-LINEAR_18030_BASE;
925
926                /* add the offset from the beginning of the range */
927                linear+=((uint32_t)cp-range[0]);
928
929                /* turn this into a four-byte sequence */
930                bytes[3]=(char)(0x30+linear%10); linear/=10;
931                bytes[2]=(char)(0x81+linear%126); linear/=126;
932                bytes[1]=(char)(0x30+linear%10); linear/=10;
933                bytes[0]=(char)(0x81+linear);
934
935                /* output this sequence */
936                ucnv_fromUWriteBytes(cnv,
937                                     bytes, 4, (char **)target, (char *)targetLimit,
938                                     offsets, sourceIndex, pErrorCode);
939                return 0;
940            }
941        }
942    }
943
944    /* no mapping */
945    *pErrorCode=U_INVALID_CHAR_FOUND;
946    return cp;
947}
948
949/*
950 * Input sequence: cnv->toUBytes[0..length[
951 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
952 *         else return 0 after output has been written to the target
953 */
954static int8_t
955_extToU(UConverter *cnv, const UConverterSharedData *sharedData,
956        int8_t length,
957        const uint8_t **source, const uint8_t *sourceLimit,
958        UChar **target, const UChar *targetLimit,
959        int32_t **offsets, int32_t sourceIndex,
960        UBool flush,
961        UErrorCode *pErrorCode) {
962    const int32_t *cx;
963
964    if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
965        ucnv_extInitialMatchToU(
966            cnv, cx,
967            length, (const char **)source, (const char *)sourceLimit,
968            target, targetLimit,
969            offsets, sourceIndex,
970            flush,
971            pErrorCode)
972    ) {
973        return 0; /* an extension mapping handled the input */
974    }
975
976    /* GB 18030 */
977    if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
978        const uint32_t *range;
979        uint32_t linear;
980        int32_t i;
981
982        linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
983        range=gb18030Ranges[0];
984        for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
985            if(range[2]<=linear && linear<=range[3]) {
986                /* found the sequence, output the Unicode code point for it */
987                *pErrorCode=U_ZERO_ERROR;
988
989                /* add the linear difference between the input and start sequences to the start code point */
990                linear=range[0]+(linear-range[2]);
991
992                /* output this code point */
993                ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
994
995                return 0;
996            }
997        }
998    }
999
1000    /* no mapping */
1001    *pErrorCode=U_INVALID_CHAR_FOUND;
1002    return length;
1003}
1004
1005/* EBCDIC swap LF<->NL ------------------------------------------------------ */
1006
/*
 * This code modifies a standard EBCDIC<->Unicode mapping table for
 * OS/390 (z/OS) Unix System Services (Open Edition).
 * The difference is in the mapping of Line Feed and New Line control codes:
 * Standard EBCDIC maps
 *
 *   <U000A> \x25 |0
 *   <U0085> \x15 |0
 *
 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
 * mapping
 *
 *   <U000A> \x15 |0
 *   <U0085> \x25 |0
 *
 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
 * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support the same EBCDIC charset in both versions
 * without duplicating the entire installed table.
 */
1027
1028/* standard EBCDIC codes */
1029#define EBCDIC_LF 0x25
1030#define EBCDIC_NL 0x15
1031
1032/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
1033#define EBCDIC_RT_LF 0xf25
1034#define EBCDIC_RT_NL 0xf15
1035
1036/* Unicode code points */
1037#define U_LF 0x0a
1038#define U_NL 0x85
1039
1040static UBool
1041_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
1042    UConverterMBCSTable *mbcsTable;
1043
1044    const uint16_t *table, *results;
1045    const uint8_t *bytes;
1046
1047    int32_t (*newStateTable)[256];
1048    uint16_t *newResults;
1049    uint8_t *p;
1050    char *name;
1051
1052    uint32_t stage2Entry;
1053    uint32_t size, sizeofFromUBytes;
1054
1055    mbcsTable=&sharedData->mbcs;
1056
1057    table=mbcsTable->fromUnicodeTable;
1058    bytes=mbcsTable->fromUnicodeBytes;
1059    results=(const uint16_t *)bytes;
1060
1061    /*
1062     * Check that this is an EBCDIC table with SBCS portion -
1063     * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
1064     *
1065     * If not, ignore the option. Options are always ignored if they do not apply.
1066     */
1067    if(!(
1068         (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
1069         mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
1070         mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
1071    )) {
1072        return FALSE;
1073    }
1074
1075    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1076        if(!(
1077             EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
1078             EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
1079        )) {
1080            return FALSE;
1081        }
1082    } else /* MBCS_OUTPUT_2_SISO */ {
1083        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1084        if(!(
1085             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
1086             EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
1087        )) {
1088            return FALSE;
1089        }
1090
1091        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1092        if(!(
1093             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
1094             EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
1095        )) {
1096            return FALSE;
1097        }
1098    }
1099
1100    if(mbcsTable->fromUBytesLength>0) {
1101        /*
1102         * We _know_ the number of bytes in the fromUnicodeBytes array
1103         * starting with header.version 4.1.
1104         */
1105        sizeofFromUBytes=mbcsTable->fromUBytesLength;
1106    } else {
1107        /*
1108         * Otherwise:
1109         * There used to be code to enumerate the fromUnicode
1110         * trie and find the highest entry, but it was removed in ICU 3.2
1111         * because it was not tested and caused a low code coverage number.
1112         * See Jitterbug 3674.
1113         * This affects only some .cnv file formats with a header.version
1114         * below 4.1, and only when swaplfnl is requested.
1115         *
1116         * ucnvmbcs.c revision 1.99 is the last one with the
1117         * ucnv_MBCSSizeofFromUBytes() function.
1118         */
1119        *pErrorCode=U_INVALID_FORMAT_ERROR;
1120        return FALSE;
1121    }
1122
1123    /*
1124     * The table has an appropriate format.
1125     * Allocate and build
1126     * - a modified to-Unicode state table
1127     * - a modified from-Unicode output array
1128     * - a converter name string with the swap option appended
1129     */
1130    size=
1131        mbcsTable->countStates*1024+
1132        sizeofFromUBytes+
1133        UCNV_MAX_CONVERTER_NAME_LENGTH+20;
1134    p=(uint8_t *)uprv_malloc(size);
1135    if(p==NULL) {
1136        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1137        return FALSE;
1138    }
1139
1140    /* copy and modify the to-Unicode state table */
1141    newStateTable=(int32_t (*)[256])p;
1142    uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
1143
1144    newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
1145    newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
1146
1147    /* copy and modify the from-Unicode result table */
1148    newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
1149    uprv_memcpy(newResults, bytes, sizeofFromUBytes);
1150
1151    /* conveniently, the table access macros work on the left side of expressions */
1152    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1153        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
1154        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
1155    } else /* MBCS_OUTPUT_2_SISO */ {
1156        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1157        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
1158
1159        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1160        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
1161    }
1162
1163    /* set the canonical converter name */
1164    name=(char *)newResults+sizeofFromUBytes;
1165    uprv_strcpy(name, sharedData->staticData->name);
1166    uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
1167
1168    /* set the pointers */
1169    umtx_lock(NULL);
1170    if(mbcsTable->swapLFNLStateTable==NULL) {
1171        mbcsTable->swapLFNLStateTable=newStateTable;
1172        mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
1173        mbcsTable->swapLFNLName=name;
1174
1175        newStateTable=NULL;
1176    }
1177    umtx_unlock(NULL);
1178
1179    /* release the allocated memory if another thread beat us to it */
1180    if(newStateTable!=NULL) {
1181        uprv_free(newStateTable);
1182    }
1183    return TRUE;
1184}
1185
1186/* reconstitute omitted fromUnicode data ------------------------------------ */
1187
1188/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
1189static UBool U_CALLCONV
1190writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
1191    UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
1192    const uint16_t *table;
1193    uint32_t *stage2;
1194    uint8_t *bytes, *p;
1195    UChar32 c;
1196    int32_t i, st3;
1197
1198    table=mbcsTable->fromUnicodeTable;
1199    bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
1200
1201    /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
1202    switch(mbcsTable->outputType) {
1203    case MBCS_OUTPUT_3_EUC:
1204        if(value<=0xffff) {
1205            /* short sequences are stored directly */
1206            /* code set 0 or 1 */
1207        } else if(value<=0x8effff) {
1208            /* code set 2 */
1209            value&=0x7fff;
1210        } else /* first byte is 0x8f */ {
1211            /* code set 3 */
1212            value&=0xff7f;
1213        }
1214        break;
1215    case MBCS_OUTPUT_4_EUC:
1216        if(value<=0xffffff) {
1217            /* short sequences are stored directly */
1218            /* code set 0 or 1 */
1219        } else if(value<=0x8effffff) {
1220            /* code set 2 */
1221            value&=0x7fffff;
1222        } else /* first byte is 0x8f */ {
1223            /* code set 3 */
1224            value&=0xff7fff;
1225        }
1226        break;
1227    default:
1228        break;
1229    }
1230
1231    for(i=0; i<=0x1f; ++value, ++i) {
1232        c=codePoints[i];
1233        if(c<0) {
1234            continue;
1235        }
1236
1237        /* locate the stage 2 & 3 data */
1238        stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
1239        p=bytes;
1240        st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
1241
1242        /* write the codepage bytes into stage 3 */
1243        switch(mbcsTable->outputType) {
1244        case MBCS_OUTPUT_3:
1245        case MBCS_OUTPUT_4_EUC:
1246            p+=st3*3;
1247            p[0]=(uint8_t)(value>>16);
1248            p[1]=(uint8_t)(value>>8);
1249            p[2]=(uint8_t)value;
1250            break;
1251        case MBCS_OUTPUT_4:
1252            ((uint32_t *)p)[st3]=value;
1253            break;
1254        default:
1255            /* 2 bytes per character */
1256            ((uint16_t *)p)[st3]=(uint16_t)value;
1257            break;
1258        }
1259
1260        /* set the roundtrip flag */
1261        *stage2|=(1UL<<(16+(c&0xf)));
1262    }
1263    return TRUE;
1264 }
1265
1266static void
1267reconstituteData(UConverterMBCSTable *mbcsTable,
1268                 uint32_t stage1Length, uint32_t stage2Length,
1269                 uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
1270                 UErrorCode *pErrorCode) {
1271    uint16_t *stage1;
1272    uint32_t *stage2;
1273    uint8_t *bytes;
1274    uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
1275    mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
1276    if(mbcsTable->reconstitutedData==NULL) {
1277        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1278        return;
1279    }
1280    uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
1281
1282    /* copy existing data and reroute the pointers */
1283    stage1=(uint16_t *)mbcsTable->reconstitutedData;
1284    uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
1285
1286    stage2=(uint32_t *)(stage1+stage1Length);
1287    uprv_memcpy(stage2+(fullStage2Length-stage2Length),
1288                mbcsTable->fromUnicodeTable+stage1Length,
1289                stage2Length*4);
1290
1291    mbcsTable->fromUnicodeTable=stage1;
1292    mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);
1293
1294    /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
1295    stage2=(uint32_t *)stage1;
1296
1297    /* reconstitute the initial part of stage 2 from the mbcsIndex */
1298    {
1299        int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
1300        int32_t stageUTF8Index=0;
1301        int32_t st1, st2, st3, i;
1302
1303        for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
1304            st2=stage1[st1];
1305            if(st2!=stage1Length/2) {
1306                /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
1307                for(i=0; i<16; ++i) {
1308                    st3=mbcsTable->mbcsIndex[stageUTF8Index++];
1309                    if(st3!=0) {
1310                        /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
1311                        st3>>=4;
1312                        /*
1313                         * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
1314                         * allocated together as a single 64-block for access from the mbcsIndex
1315                         */
1316                        stage2[st2++]=st3++;
1317                        stage2[st2++]=st3++;
1318                        stage2[st2++]=st3++;
1319                        stage2[st2++]=st3;
1320                    } else {
1321                        /* no stage 3 block, skip */
1322                        st2+=4;
1323                    }
1324                }
1325            } else {
1326                /* no stage 2 block, skip */
1327                stageUTF8Index+=16;
1328            }
1329        }
1330    }
1331
1332    /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
1333    ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
1334}
1335
1336/* MBCS setup functions ----------------------------------------------------- */
1337
1338static void
1339ucnv_MBCSLoad(UConverterSharedData *sharedData,
1340          UConverterLoadArgs *pArgs,
1341          const uint8_t *raw,
1342          UErrorCode *pErrorCode) {
1343    UDataInfo info;
1344    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1345    _MBCSHeader *header=(_MBCSHeader *)raw;
1346    uint32_t offset;
1347    uint32_t headerLength;
1348    UBool noFromU=FALSE;
1349
1350    if(header->version[0]==4) {
1351        headerLength=MBCS_HEADER_V4_LENGTH;
1352    } else if(header->version[0]==5 && header->version[1]>=3 &&
1353              (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
1354        headerLength=header->options&MBCS_OPT_LENGTH_MASK;
1355        noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
1356    } else {
1357        *pErrorCode=U_INVALID_TABLE_FORMAT;
1358        return;
1359    }
1360
1361    mbcsTable->outputType=(uint8_t)header->flags;
1362    if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
1363        *pErrorCode=U_INVALID_TABLE_FORMAT;
1364        return;
1365    }
1366
1367    /* extension data, header version 4.2 and higher */
1368    offset=header->flags>>8;
1369    if(offset!=0) {
1370        mbcsTable->extIndexes=(const int32_t *)(raw+offset);
1371    }
1372
1373    if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
1374        UConverterLoadArgs args={ 0 };
1375        UConverterSharedData *baseSharedData;
1376        const int32_t *extIndexes;
1377        const char *baseName;
1378
1379        /* extension-only file, load the base table and set values appropriately */
1380        if((extIndexes=mbcsTable->extIndexes)==NULL) {
1381            /* extension-only file without extension */
1382            *pErrorCode=U_INVALID_TABLE_FORMAT;
1383            return;
1384        }
1385
1386        if(pArgs->nestedLoads!=1) {
1387            /* an extension table must not be loaded as a base table */
1388            *pErrorCode=U_INVALID_TABLE_FILE;
1389            return;
1390        }
1391
1392        /* load the base table */
1393        baseName=(const char *)header+headerLength*4;
1394        if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
1395            /* forbid loading this same extension-only file */
1396            *pErrorCode=U_INVALID_TABLE_FORMAT;
1397            return;
1398        }
1399
1400        /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
1401        args.size=sizeof(UConverterLoadArgs);
1402        args.nestedLoads=2;
1403        args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
1404        args.reserved=pArgs->reserved;
1405        args.options=pArgs->options;
1406        args.pkg=pArgs->pkg;
1407        args.name=baseName;
1408        baseSharedData=ucnv_load(&args, pErrorCode);
1409        if(U_FAILURE(*pErrorCode)) {
1410            return;
1411        }
1412        if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
1413            baseSharedData->mbcs.baseSharedData!=NULL
1414        ) {
1415            ucnv_unload(baseSharedData);
1416            *pErrorCode=U_INVALID_TABLE_FORMAT;
1417            return;
1418        }
1419        if(pArgs->onlyTestIsLoadable) {
1420            /*
1421             * Exit as soon as we know that we can load the converter
1422             * and the format is valid and supported.
1423             * The worst that can happen in the following code is a memory
1424             * allocation error.
1425             */
1426            ucnv_unload(baseSharedData);
1427            return;
1428        }
1429
1430        /* copy the base table data */
1431        uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
1432
1433        /* overwrite values with relevant ones for the extension converter */
1434        mbcsTable->baseSharedData=baseSharedData;
1435        mbcsTable->extIndexes=extIndexes;
1436
1437        /*
1438         * It would be possible to share the swapLFNL data with a base converter,
1439         * but the generated name would have to be different, and the memory
1440         * would have to be free'd only once.
1441         * It is easier to just create the data for the extension converter
1442         * separately when it is requested.
1443         */
1444        mbcsTable->swapLFNLStateTable=NULL;
1445        mbcsTable->swapLFNLFromUnicodeBytes=NULL;
1446        mbcsTable->swapLFNLName=NULL;
1447
1448        /*
1449         * The reconstitutedData must be deleted only when the base converter
1450         * is unloaded.
1451         */
1452        mbcsTable->reconstitutedData=NULL;
1453
1454        /*
1455         * Set a special, runtime-only outputType if the extension converter
1456         * is a DBCS version of a base converter that also maps single bytes.
1457         */
1458        if( sharedData->staticData->conversionType==UCNV_DBCS ||
1459                (sharedData->staticData->conversionType==UCNV_MBCS &&
1460                 sharedData->staticData->minBytesPerChar>=2)
1461        ) {
1462            if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
1463                /* the base converter is SI/SO-stateful */
1464                int32_t entry;
1465
1466                /* get the dbcs state from the state table entry for SO=0x0e */
1467                entry=mbcsTable->stateTable[0][0xe];
1468                if( MBCS_ENTRY_IS_FINAL(entry) &&
1469                    MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
1470                    MBCS_ENTRY_FINAL_STATE(entry)!=0
1471                ) {
1472                    mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1473
1474                    mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1475                }
1476            } else if(
1477                baseSharedData->staticData->conversionType==UCNV_MBCS &&
1478                baseSharedData->staticData->minBytesPerChar==1 &&
1479                baseSharedData->staticData->maxBytesPerChar==2 &&
1480                mbcsTable->countStates<=127
1481            ) {
1482                /* non-stateful base converter, need to modify the state table */
1483                int32_t (*newStateTable)[256];
1484                int32_t *state;
1485                int32_t i, count;
1486
1487                /* allocate a new state table and copy the base state table contents */
1488                count=mbcsTable->countStates;
1489                newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
1490                if(newStateTable==NULL) {
1491                    ucnv_unload(baseSharedData);
1492                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1493                    return;
1494                }
1495
1496                uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
1497
1498                /* change all final single-byte entries to go to a new all-illegal state */
1499                state=newStateTable[0];
1500                for(i=0; i<256; ++i) {
1501                    if(MBCS_ENTRY_IS_FINAL(state[i])) {
1502                        state[i]=MBCS_ENTRY_TRANSITION(count, 0);
1503                    }
1504                }
1505
1506                /* build the new all-illegal state */
1507                state=newStateTable[count];
1508                for(i=0; i<256; ++i) {
1509                    state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
1510                }
1511                mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
1512                mbcsTable->countStates=(uint8_t)(count+1);
1513                mbcsTable->stateTableOwned=TRUE;
1514
1515                mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1516            }
1517        }
1518
1519        /*
1520         * unlike below for files with base tables, do not get the unicodeMask
1521         * from the sharedData; instead, use the base table's unicodeMask,
1522         * which we copied in the memcpy above;
1523         * this is necessary because the static data unicodeMask, especially
1524         * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1525         */
1526    } else {
1527        /* conversion file with a base table; an additional extension table is optional */
1528        /* make sure that the output type is known */
1529        switch(mbcsTable->outputType) {
1530        case MBCS_OUTPUT_1:
1531        case MBCS_OUTPUT_2:
1532        case MBCS_OUTPUT_3:
1533        case MBCS_OUTPUT_4:
1534        case MBCS_OUTPUT_3_EUC:
1535        case MBCS_OUTPUT_4_EUC:
1536        case MBCS_OUTPUT_2_SISO:
1537            /* OK */
1538            break;
1539        default:
1540            *pErrorCode=U_INVALID_TABLE_FORMAT;
1541            return;
1542        }
1543        if(pArgs->onlyTestIsLoadable) {
1544            /*
1545             * Exit as soon as we know that we can load the converter
1546             * and the format is valid and supported.
1547             * The worst that can happen in the following code is a memory
1548             * allocation error.
1549             */
1550            return;
1551        }
1552
1553        mbcsTable->countStates=(uint8_t)header->countStates;
1554        mbcsTable->countToUFallbacks=header->countToUFallbacks;
1555        mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
1556        mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
1557        mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
1558
1559        mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
1560        mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
1561        mbcsTable->fromUBytesLength=header->fromUBytesLength;
1562
1563        /*
1564         * converter versions 6.1 and up contain a unicodeMask that is
1565         * used here to select the most efficient function implementations
1566         */
1567        info.size=sizeof(UDataInfo);
1568        udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
1569        if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
1570            /* mask off possible future extensions to be safe */
1571            mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
1572        } else {
1573            /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1574            mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
1575        }
1576
1577        /*
1578         * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1579         * Check for the header version, SBCS vs. MBCS, and for whether the
1580         * data structures are optimized for code points as high as what the
1581         * runtime code is designed for.
1582         * The implementation does not handle mapping tables with entries for
1583         * unpaired surrogates.
1584         */
1585        if( header->version[1]>=3 &&
1586            (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
1587            (mbcsTable->countStates==1 ?
1588                (header->version[2]>=(SBCS_FAST_MAX>>8)) :
1589                (header->version[2]>=(MBCS_FAST_MAX>>8))
1590            )
1591        ) {
1592            mbcsTable->utf8Friendly=TRUE;
1593
1594            if(mbcsTable->countStates==1) {
1595                /*
1596                 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1597                 * Build a table with indexes to each block, to be used instead of
1598                 * the regular stage 1/2 table.
1599                 */
1600                int32_t i;
1601                for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
1602                    mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
1603                }
1604                /* set maxFastUChar to SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
1605                mbcsTable->maxFastUChar=SBCS_FAST_MAX;
1606            } else {
1607                /*
1608                 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1609                 * The .cnv file is prebuilt with an additional stage table with indexes
1610                 * to each block.
1611                 */
1612                mbcsTable->mbcsIndex=(const uint16_t *)
1613                    (mbcsTable->fromUnicodeBytes+
1614                     (noFromU ? 0 : mbcsTable->fromUBytesLength));
1615                mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
1616            }
1617        }
1618
1619        /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
1620        {
1621            uint32_t asciiRoundtrips=0xffffffff;
1622            int32_t i;
1623
1624            for(i=0; i<0x80; ++i) {
1625                if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
1626                    asciiRoundtrips&=~((uint32_t)1<<(i>>2));
1627                }
1628            }
1629            mbcsTable->asciiRoundtrips=asciiRoundtrips;
1630        }
1631
1632        if(noFromU) {
1633            uint32_t stage1Length=
1634                mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
1635                    0x440 : 0x40;
1636            uint32_t stage2Length=
1637                (header->offsetFromUBytes-header->offsetFromUTable)/4-
1638                stage1Length/2;
1639            reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
1640        }
1641    }
1642
1643    /* Set the impl pointer here so that it is set for both extension-only and base tables. */
1644    if(mbcsTable->utf8Friendly) {
1645        if(mbcsTable->countStates==1) {
1646            sharedData->impl=&_SBCSUTF8Impl;
1647        } else {
1648            if(mbcsTable->outputType==MBCS_OUTPUT_2) {
1649                sharedData->impl=&_DBCSUTF8Impl;
1650            }
1651        }
1652    }
1653
1654    if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
1655        /*
1656         * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1657         * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1658         */
1659        mbcsTable->asciiRoundtrips=0;
1660    }
1661}
1662
1663static void
1664ucnv_MBCSUnload(UConverterSharedData *sharedData) {
1665    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1666
1667    if(mbcsTable->swapLFNLStateTable!=NULL) {
1668        uprv_free(mbcsTable->swapLFNLStateTable);
1669    }
1670    if(mbcsTable->stateTableOwned) {
1671        uprv_free((void *)mbcsTable->stateTable);
1672    }
1673    if(mbcsTable->baseSharedData!=NULL) {
1674        ucnv_unload(mbcsTable->baseSharedData);
1675    }
1676    if(mbcsTable->reconstitutedData!=NULL) {
1677        uprv_free(mbcsTable->reconstitutedData);
1678    }
1679}
1680
1681static void
1682ucnv_MBCSOpen(UConverter *cnv,
1683              UConverterLoadArgs *pArgs,
1684              UErrorCode *pErrorCode) {
1685    UConverterMBCSTable *mbcsTable;
1686    const int32_t *extIndexes;
1687    uint8_t outputType;
1688    int8_t maxBytesPerUChar;
1689
1690    if(pArgs->onlyTestIsLoadable) {
1691        return;
1692    }
1693
1694    mbcsTable=&cnv->sharedData->mbcs;
1695    outputType=mbcsTable->outputType;
1696
1697    if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
1698        /* the swaplfnl option does not apply, remove it */
1699        cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
1700    }
1701
1702    if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1703        /* do this because double-checked locking is broken */
1704        UBool isCached;
1705
1706        umtx_lock(NULL);
1707        isCached=mbcsTable->swapLFNLStateTable!=NULL;
1708        umtx_unlock(NULL);
1709
1710        if(!isCached) {
1711            if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
1712                if(U_FAILURE(*pErrorCode)) {
1713                    return; /* something went wrong */
1714                }
1715
1716                /* the option does not apply, remove it */
1717                cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
1718            }
1719        }
1720    }
1721
1722    if(uprv_strstr(pArgs->name, "18030")!=NULL) {
1723        if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
1724            /* set a flag for GB 18030 mode, which changes the callback behavior */
1725            cnv->options|=_MBCS_OPTION_GB18030;
1726        }
1727    }
1728
1729    /* fix maxBytesPerUChar depending on outputType and options etc. */
1730    if(outputType==MBCS_OUTPUT_2_SISO) {
1731        cnv->maxBytesPerUChar=3; /* SO+DBCS */
1732    }
1733
1734    extIndexes=mbcsTable->extIndexes;
1735    if(extIndexes!=NULL) {
1736        maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
1737        if(outputType==MBCS_OUTPUT_2_SISO) {
1738            ++maxBytesPerUChar; /* SO + multiple DBCS */
1739        }
1740
1741        if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
1742            cnv->maxBytesPerUChar=maxBytesPerUChar;
1743        }
1744    }
1745
1746#if 0
1747    /*
1748     * documentation of UConverter fields used for status
1749     * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1750     */
1751
1752    /* toUnicode */
1753    cnv->toUnicodeStatus=0;     /* offset */
1754    cnv->mode=0;                /* state */
1755    cnv->toULength=0;           /* byteIndex */
1756
1757    /* fromUnicode */
1758    cnv->fromUChar32=0;
1759    cnv->fromUnicodeStatus=1;   /* prevLength */
1760#endif
1761}
1762
1763static const char *
1764ucnv_MBCSGetName(const UConverter *cnv) {
1765    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
1766        return cnv->sharedData->mbcs.swapLFNLName;
1767    } else {
1768        return cnv->sharedData->staticData->name;
1769    }
1770}
1771
1772/* MBCS-to-Unicode conversion functions ------------------------------------- */
1773
1774static UChar32
1775ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
1776    const _MBCSToUFallback *toUFallbacks;
1777    uint32_t i, start, limit;
1778
1779    limit=mbcsTable->countToUFallbacks;
1780    if(limit>0) {
1781        /* do a binary search for the fallback mapping */
1782        toUFallbacks=mbcsTable->toUFallbacks;
1783        start=0;
1784        while(start<limit-1) {
1785            i=(start+limit)/2;
1786            if(offset<toUFallbacks[i].offset) {
1787                limit=i;
1788            } else {
1789                start=i;
1790            }
1791        }
1792
1793        /* did we really find it? */
1794        if(offset==toUFallbacks[start].offset) {
1795            return toUFallbacks[start].codePoint;
1796        }
1797    }
1798
1799    return 0xfffe;
1800}
1801
1802/* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
/*
 * Every source byte is looked up in row 0 of the state table and the entry
 * is treated as a final entry. Besides the 16-bit direct actions, the 20-bit
 * direct actions are handled here and written as surrogate pairs.
 *
 * pArgs      - in/out: source, target and offsets are read at the start and
 *              written back, advanced, before returning; offsets may be NULL
 * pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target is full while
 *              input remains (or when a trail surrogate does not fit),
 *              to U_ILLEGAL_CHAR_FOUND for a byte with an illegal entry,
 *              or to whatever _extToU() reports for a byte that has no
 *              direct mapping
 */
1803static void
1804ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1805                                UErrorCode *pErrorCode) {
1806    UConverter *cnv;
1807    const uint8_t *source, *sourceLimit;
1808    UChar *target;
1809    const UChar *targetLimit;
1810    int32_t *offsets;
1811
1812    const int32_t (*stateTable)[256];
1813
1814    int32_t sourceIndex;
1815
1816    int32_t entry;
1817    UChar c;
1818    uint8_t action;
1819
1820    /* set up the local pointers */
1821    cnv=pArgs->converter;
1822    source=(const uint8_t *)pArgs->source;
1823    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1824    target=pArgs->target;
1825    targetLimit=pArgs->targetLimit;
1826    offsets=pArgs->offsets;
1827
    /* when UCNV_OPTION_SWAP_LFNL is set, look bytes up in swapLFNLStateTable instead of the regular state table */
1828    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1829        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1830    } else {
1831        stateTable=cnv->sharedData->mbcs.stateTable;
1832    }
1833
1834    /* sourceIndex=-1 if the current character began in the previous buffer */
1835    sourceIndex=0;
1836
1837    /* conversion loop */
1838    while(source<sourceLimit) {
1839        /*
1840         * This following test is to see if available input would overflow the output.
1841         * It does not catch output of more than one code unit that
1842         * overflows as a result of a surrogate pair or callback output
1843         * from the last source byte.
1844         * Therefore, those situations also test for overflows and will
1845         * then break the loop, too.
1846         */
1847        if(target>=targetLimit) {
1848            /* target is full */
1849            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1850            break;
1851        }
1852
        /* the byte is consumed here: in every path below, source already points behind it */
1853        entry=stateTable[0][*source++];
1854        /* MBCS_ENTRY_IS_FINAL(entry) */
1855
1856        /* test the most common case first */
1857        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1858            /* output BMP code point */
1859            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1860            if(offsets!=NULL) {
1861                *offsets++=sourceIndex;
1862            }
1863
1864            /* normal end of action codes: prepare for a new character */
1865            ++sourceIndex;
1866            continue;
1867        }
1868
1869        /*
1870         * An if-else-if chain provides more reliable performance for
1871         * the most common cases compared to a switch.
1872         */
1873        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1874        if(action==MBCS_STATE_VALID_DIRECT_20 ||
1875           (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
1876        ) {
            /* entry is reused for the 20-bit value: its upper bits form the lead surrogate, its low 10 bits the trail surrogate */
1877            entry=MBCS_ENTRY_FINAL_VALUE(entry);
1878            /* output surrogate pair */
1879            *target++=(UChar)(0xd800|(UChar)(entry>>10));
1880            if(offsets!=NULL) {
1881                *offsets++=sourceIndex;
1882            }
1883            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1884            if(target<targetLimit) {
1885                *target++=c;
1886                if(offsets!=NULL) {
1887                    *offsets++=sourceIndex;
1888                }
1889            } else {
1890                /* target overflow */
                /* the trail surrogate is parked in the converter's UCharErrorBuffer */
1891                cnv->UCharErrorBuffer[0]=c;
1892                cnv->UCharErrorBufferLength=1;
1893                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1894                break;
1895            }
1896
1897            ++sourceIndex;
1898            continue;
1899        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1900            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
1901                /* output BMP code point */
1902                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1903                if(offsets!=NULL) {
1904                    *offsets++=sourceIndex;
1905                }
1906
1907                ++sourceIndex;
1908                continue;
1909            }
            /* fallbacks not used: drop out of the chain and try the extension lookup below */
1910        } else if(action==MBCS_STATE_UNASSIGNED) {
1911            /* just fall through */
1912        } else if(action==MBCS_STATE_ILLEGAL) {
1913            /* callback(illegal) */
1914            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1915        } else {
1916            /* reserved, must never occur */
            /* the byte is skipped without output and without an error */
1917            ++sourceIndex;
1918            continue;
1919        }
1920
1921        if(U_FAILURE(*pErrorCode)) {
1922            /* callback(illegal) */
1923            break;
1924        } else /* unassigned sequences indicated with byteIndex>0 */ {
1925            /* try an extension mapping */
            /*
             * pArgs->source is used as a scratch marker for where _extToU()
             * starts, so that the number of further bytes it consumes can be
             * added to sourceIndex (the leading 1 is the byte consumed above);
             * pArgs->source gets its real value again after the loop
             */
1926            pArgs->source=(const char *)source;
1927            cnv->toUBytes[0]=*(source-1);
1928            cnv->toULength=_extToU(cnv, cnv->sharedData,
1929                                    1, &source, sourceLimit,
1930                                    &target, targetLimit,
1931                                    &offsets, sourceIndex,
1932                                    pArgs->flush,
1933                                    pErrorCode);
1934            sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
1935
1936            if(U_FAILURE(*pErrorCode)) {
1937                /* not mappable or buffer overflow */
1938                break;
1939            }
1940        }
1941    }
1942
1943    /* write back the updated pointers */
1944    pArgs->source=(const char *)source;
1945    pArgs->target=target;
1946    pArgs->offsets=offsets;
1947}
1948
1949/*
1950 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1951 * that only map to and from the BMP.
1952 * In addition to single-byte optimizations, the offset calculations
1953 * become much easier.
1954 */
1955static void
1956ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
1957                            UErrorCode *pErrorCode) {
1958    UConverter *cnv;
1959    const uint8_t *source, *sourceLimit, *lastSource;
1960    UChar *target;
1961    int32_t targetCapacity, length;
1962    int32_t *offsets;
1963
1964    const int32_t (*stateTable)[256];
1965
1966    int32_t sourceIndex;
1967
1968    int32_t entry;
1969    uint8_t action;
1970
1971    /* set up the local pointers */
1972    cnv=pArgs->converter;
1973    source=(const uint8_t *)pArgs->source;
1974    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1975    target=pArgs->target;
1976    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1977    offsets=pArgs->offsets;
1978
1979    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1980        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1981    } else {
1982        stateTable=cnv->sharedData->mbcs.stateTable;
1983    }
1984
1985    /* sourceIndex=-1 if the current character began in the previous buffer */
1986    sourceIndex=0;
1987    lastSource=source;
1988
1989    /*
1990     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1991     * for the minimum of the sourceLength and targetCapacity
1992     */
1993    length=(int32_t)(sourceLimit-source);
1994    if(length<targetCapacity) {
1995        targetCapacity=length;
1996    }
1997
1998#if MBCS_UNROLL_SINGLE_TO_BMP
1999    /* unrolling makes it faster on Pentium III/Windows 2000 */
2000    /* unroll the loop with the most common case */
2001unrolled:
2002    if(targetCapacity>=16) {
2003        int32_t count, loops, oredEntries;
2004
2005        loops=count=targetCapacity>>4;
2006        do {
2007            oredEntries=entry=stateTable[0][*source++];
2008            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2009            oredEntries|=entry=stateTable[0][*source++];
2010            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2011            oredEntries|=entry=stateTable[0][*source++];
2012            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2013            oredEntries|=entry=stateTable[0][*source++];
2014            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2015            oredEntries|=entry=stateTable[0][*source++];
2016            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2017            oredEntries|=entry=stateTable[0][*source++];
2018            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2019            oredEntries|=entry=stateTable[0][*source++];
2020            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2021            oredEntries|=entry=stateTable[0][*source++];
2022            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2023            oredEntries|=entry=stateTable[0][*source++];
2024            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2025            oredEntries|=entry=stateTable[0][*source++];
2026            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2027            oredEntries|=entry=stateTable[0][*source++];
2028            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2029            oredEntries|=entry=stateTable[0][*source++];
2030            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2031            oredEntries|=entry=stateTable[0][*source++];
2032            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2033            oredEntries|=entry=stateTable[0][*source++];
2034            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2035            oredEntries|=entry=stateTable[0][*source++];
2036            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2037            oredEntries|=entry=stateTable[0][*source++];
2038            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2039
2040            /* were all 16 entries really valid? */
2041            if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
2042                /* no, return to the first of these 16 */
2043                source-=16;
2044                target-=16;
2045                break;
2046            }
2047        } while(--count>0);
2048        count=loops-count;
2049        targetCapacity-=16*count;
2050
2051        if(offsets!=NULL) {
2052            lastSource+=16*count;
2053            while(count>0) {
2054                *offsets++=sourceIndex++;
2055                *offsets++=sourceIndex++;
2056                *offsets++=sourceIndex++;
2057                *offsets++=sourceIndex++;
2058                *offsets++=sourceIndex++;
2059                *offsets++=sourceIndex++;
2060                *offsets++=sourceIndex++;
2061                *offsets++=sourceIndex++;
2062                *offsets++=sourceIndex++;
2063                *offsets++=sourceIndex++;
2064                *offsets++=sourceIndex++;
2065                *offsets++=sourceIndex++;
2066                *offsets++=sourceIndex++;
2067                *offsets++=sourceIndex++;
2068                *offsets++=sourceIndex++;
2069                *offsets++=sourceIndex++;
2070                --count;
2071            }
2072        }
2073    }
2074#endif
2075
2076    /* conversion loop */
2077    while(targetCapacity>0) {
2078        entry=stateTable[0][*source++];
2079        /* MBCS_ENTRY_IS_FINAL(entry) */
2080
2081        /* test the most common case first */
2082        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2083            /* output BMP code point */
2084            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2085            --targetCapacity;
2086            continue;
2087        }
2088
2089        /*
2090         * An if-else-if chain provides more reliable performance for
2091         * the most common cases compared to a switch.
2092         */
2093        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2094        if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2095            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2096                /* output BMP code point */
2097                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2098                --targetCapacity;
2099                continue;
2100            }
2101        } else if(action==MBCS_STATE_UNASSIGNED) {
2102            /* just fall through */
2103        } else if(action==MBCS_STATE_ILLEGAL) {
2104            /* callback(illegal) */
2105            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2106        } else {
2107            /* reserved, must never occur */
2108            continue;
2109        }
2110
2111        /* set offsets since the start or the last extension */
2112        if(offsets!=NULL) {
2113            int32_t count=(int32_t)(source-lastSource);
2114
2115            /* predecrement: do not set the offset for the callback-causing character */
2116            while(--count>0) {
2117                *offsets++=sourceIndex++;
2118            }
2119            /* offset and sourceIndex are now set for the current character */
2120        }
2121
2122        if(U_FAILURE(*pErrorCode)) {
2123            /* callback(illegal) */
2124            break;
2125        } else /* unassigned sequences indicated with byteIndex>0 */ {
2126            /* try an extension mapping */
2127            lastSource=source;
2128            cnv->toUBytes[0]=*(source-1);
2129            cnv->toULength=_extToU(cnv, cnv->sharedData,
2130                                    1, &source, sourceLimit,
2131                                    &target, pArgs->targetLimit,
2132                                    &offsets, sourceIndex,
2133                                    pArgs->flush,
2134                                    pErrorCode);
2135            sourceIndex+=1+(int32_t)(source-lastSource);
2136
2137            if(U_FAILURE(*pErrorCode)) {
2138                /* not mappable or buffer overflow */
2139                break;
2140            }
2141
2142            /* recalculate the targetCapacity after an extension mapping */
2143            targetCapacity=(int32_t)(pArgs->targetLimit-target);
2144            length=(int32_t)(sourceLimit-source);
2145            if(length<targetCapacity) {
2146                targetCapacity=length;
2147            }
2148        }
2149
2150#if MBCS_UNROLL_SINGLE_TO_BMP
2151        /* unrolling makes it faster on Pentium III/Windows 2000 */
2152        goto unrolled;
2153#endif
2154    }
2155
2156    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
2157        /* target is full */
2158        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2159    }
2160
2161    /* set offsets since the start or the last callback */
2162    if(offsets!=NULL) {
2163        size_t count=source-lastSource;
2164        while(count>0) {
2165            *offsets++=sourceIndex++;
2166            --count;
2167        }
2168    }
2169
2170    /* write back the updated pointers */
2171    pArgs->source=(const char *)source;
2172    pArgs->target=target;
2173    pArgs->offsets=offsets;
2174}
2175
2176static UBool
2177hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
2178    const int32_t *row=stateTable[state];
2179    int32_t b, entry;
2180    /* First test for final entries in this state for some commonly valid byte values. */
2181    entry=row[0xa1];
2182    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2183        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2184    ) {
2185        return TRUE;
2186    }
2187    entry=row[0x41];
2188    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2189        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2190    ) {
2191        return TRUE;
2192    }
2193    /* Then test for final entries in this state. */
2194    for(b=0; b<=0xff; ++b) {
2195        entry=row[b];
2196        if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2197            MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2198        ) {
2199            return TRUE;
2200        }
2201    }
2202    /* Then recurse for transition entries. */
2203    for(b=0; b<=0xff; ++b) {
2204        entry=row[b];
2205        if( MBCS_ENTRY_IS_TRANSITION(entry) &&
2206            hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
2207        ) {
2208            return TRUE;
2209        }
2210    }
2211    return FALSE;
2212}
2213
2214/*
2215 * Is byte b a single/lead byte in this state?
2216 * Recurse for transition states, because here we don't want to say that
2217 * b is a lead byte if all byte sequences that start with b are illegal.
2218 */
2219static UBool
2220isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
2221    const int32_t *row=stateTable[state];
2222    int32_t entry=row[b];
2223    if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */
2224        return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
2225    } else {
2226        uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2227        if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
2228            return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
2229        } else {
2230            return action!=MBCS_STATE_ILLEGAL;
2231        }
2232    }
2233}
2234
2235U_CFUNC void
2236ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
2237                          UErrorCode *pErrorCode) {
2238    UConverter *cnv;
2239    const uint8_t *source, *sourceLimit;
2240    UChar *target;
2241    const UChar *targetLimit;
2242    int32_t *offsets;
2243
2244    const int32_t (*stateTable)[256];
2245    const uint16_t *unicodeCodeUnits;
2246
2247    uint32_t offset;
2248    uint8_t state;
2249    int8_t byteIndex;
2250    uint8_t *bytes;
2251
2252    int32_t sourceIndex, nextSourceIndex;
2253
2254    int32_t entry;
2255    UChar c;
2256    uint8_t action;
2257
2258    /* use optimized function if possible */
2259    cnv=pArgs->converter;
2260
2261    if(cnv->preToULength>0) {
2262        /*
2263         * pass sourceIndex=-1 because we continue from an earlier buffer
2264         * in the future, this may change with continuous offsets
2265         */
2266        ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
2267
2268        if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
2269            return;
2270        }
2271    }
2272
2273    if(cnv->sharedData->mbcs.countStates==1) {
2274        if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2275            ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
2276        } else {
2277            ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
2278        }
2279        return;
2280    }
2281
2282    /* set up the local pointers */
2283    source=(const uint8_t *)pArgs->source;
2284    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2285    target=pArgs->target;
2286    targetLimit=pArgs->targetLimit;
2287    offsets=pArgs->offsets;
2288
2289    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2290        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2291    } else {
2292        stateTable=cnv->sharedData->mbcs.stateTable;
2293    }
2294    unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2295
2296    /* get the converter state from UConverter */
2297    offset=cnv->toUnicodeStatus;
2298    byteIndex=cnv->toULength;
2299    bytes=cnv->toUBytes;
2300
2301    /*
2302     * if we are in the SBCS state for a DBCS-only converter,
2303     * then load the DBCS state from the MBCS data
2304     * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2305     */
2306    if((state=(uint8_t)(cnv->mode))==0) {
2307        state=cnv->sharedData->mbcs.dbcsOnlyState;
2308    }
2309
2310    /* sourceIndex=-1 if the current character began in the previous buffer */
2311    sourceIndex=byteIndex==0 ? 0 : -1;
2312    nextSourceIndex=0;
2313
2314    /* conversion loop */
2315    while(source<sourceLimit) {
2316        /*
2317         * This following test is to see if available input would overflow the output.
2318         * It does not catch output of more than one code unit that
2319         * overflows as a result of a surrogate pair or callback output
2320         * from the last source byte.
2321         * Therefore, those situations also test for overflows and will
2322         * then break the loop, too.
2323         */
2324        if(target>=targetLimit) {
2325            /* target is full */
2326            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2327            break;
2328        }
2329
2330        if(byteIndex==0) {
2331            /* optimized loop for 1/2-byte input and BMP output */
2332            if(offsets==NULL) {
2333                do {
2334                    entry=stateTable[state][*source];
2335                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2336                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2337                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2338
2339                        ++source;
2340                        if( source<sourceLimit &&
2341                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2342                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2343                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2344                        ) {
2345                            ++source;
2346                            *target++=c;
2347                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2348                            offset=0;
2349                        } else {
2350                            /* set the state and leave the optimized loop */
2351                            bytes[0]=*(source-1);
2352                            byteIndex=1;
2353                            break;
2354                        }
2355                    } else {
2356                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2357                            /* output BMP code point */
2358                            ++source;
2359                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2360                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2361                        } else {
2362                            /* leave the optimized loop */
2363                            break;
2364                        }
2365                    }
2366                } while(source<sourceLimit && target<targetLimit);
2367            } else /* offsets!=NULL */ {
2368                do {
2369                    entry=stateTable[state][*source];
2370                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2371                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2372                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2373
2374                        ++source;
2375                        if( source<sourceLimit &&
2376                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2377                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2378                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2379                        ) {
2380                            ++source;
2381                            *target++=c;
2382                            if(offsets!=NULL) {
2383                                *offsets++=sourceIndex;
2384                                sourceIndex=(nextSourceIndex+=2);
2385                            }
2386                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2387                            offset=0;
2388                        } else {
2389                            /* set the state and leave the optimized loop */
2390                            ++nextSourceIndex;
2391                            bytes[0]=*(source-1);
2392                            byteIndex=1;
2393                            break;
2394                        }
2395                    } else {
2396                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2397                            /* output BMP code point */
2398                            ++source;
2399                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2400                            if(offsets!=NULL) {
2401                                *offsets++=sourceIndex;
2402                                sourceIndex=++nextSourceIndex;
2403                            }
2404                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2405                        } else {
2406                            /* leave the optimized loop */
2407                            break;
2408                        }
2409                    }
2410                } while(source<sourceLimit && target<targetLimit);
2411            }
2412
2413            /*
2414             * these tests and break statements could be put inside the loop
2415             * if C had "break outerLoop" like Java
2416             */
2417            if(source>=sourceLimit) {
2418                break;
2419            }
2420            if(target>=targetLimit) {
2421                /* target is full */
2422                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2423                break;
2424            }
2425
2426            ++nextSourceIndex;
2427            bytes[byteIndex++]=*source++;
2428        } else /* byteIndex>0 */ {
2429            ++nextSourceIndex;
2430            entry=stateTable[state][bytes[byteIndex++]=*source++];
2431        }
2432
2433        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2434            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2435            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2436            continue;
2437        }
2438
2439        /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2440        cnv->mode=state;
2441
2442        /* set the next state early so that we can reuse the entry variable */
2443        state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2444
2445        /*
2446         * An if-else-if chain provides more reliable performance for
2447         * the most common cases compared to a switch.
2448         */
2449        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2450        if(action==MBCS_STATE_VALID_16) {
2451            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2452            c=unicodeCodeUnits[offset];
2453            if(c<0xfffe) {
2454                /* output BMP code point */
2455                *target++=c;
2456                if(offsets!=NULL) {
2457                    *offsets++=sourceIndex;
2458                }
2459                byteIndex=0;
2460            } else if(c==0xfffe) {
2461                if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2462                    /* output fallback BMP code point */
2463                    *target++=(UChar)entry;
2464                    if(offsets!=NULL) {
2465                        *offsets++=sourceIndex;
2466                    }
2467                    byteIndex=0;
2468                }
2469            } else {
2470                /* callback(illegal) */
2471                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2472            }
2473        } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2474            /* output BMP code point */
2475            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2476            if(offsets!=NULL) {
2477                *offsets++=sourceIndex;
2478            }
2479            byteIndex=0;
2480        } else if(action==MBCS_STATE_VALID_16_PAIR) {
2481            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2482            c=unicodeCodeUnits[offset++];
2483            if(c<0xd800) {
2484                /* output BMP code point below 0xd800 */
2485                *target++=c;
2486                if(offsets!=NULL) {
2487                    *offsets++=sourceIndex;
2488                }
2489                byteIndex=0;
2490            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2491                /* output roundtrip or fallback surrogate pair */
2492                *target++=(UChar)(c&0xdbff);
2493                if(offsets!=NULL) {
2494                    *offsets++=sourceIndex;
2495                }
2496                byteIndex=0;
2497                if(target<targetLimit) {
2498                    *target++=unicodeCodeUnits[offset];
2499                    if(offsets!=NULL) {
2500                        *offsets++=sourceIndex;
2501                    }
2502                } else {
2503                    /* target overflow */
2504                    cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
2505                    cnv->UCharErrorBufferLength=1;
2506                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2507
2508                    offset=0;
2509                    break;
2510                }
2511            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2512                /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2513                *target++=unicodeCodeUnits[offset];
2514                if(offsets!=NULL) {
2515                    *offsets++=sourceIndex;
2516                }
2517                byteIndex=0;
2518            } else if(c==0xffff) {
2519                /* callback(illegal) */
2520                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2521            }
2522        } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2523                  (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2524        ) {
2525            entry=MBCS_ENTRY_FINAL_VALUE(entry);
2526            /* output surrogate pair */
2527            *target++=(UChar)(0xd800|(UChar)(entry>>10));
2528            if(offsets!=NULL) {
2529                *offsets++=sourceIndex;
2530            }
2531            byteIndex=0;
2532            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
2533            if(target<targetLimit) {
2534                *target++=c;
2535                if(offsets!=NULL) {
2536                    *offsets++=sourceIndex;
2537                }
2538            } else {
2539                /* target overflow */
2540                cnv->UCharErrorBuffer[0]=c;
2541                cnv->UCharErrorBufferLength=1;
2542                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2543
2544                offset=0;
2545                break;
2546            }
2547        } else if(action==MBCS_STATE_CHANGE_ONLY) {
2548            /*
2549             * This serves as a state change without any output.
2550             * It is useful for reading simple stateful encodings,
2551             * for example using just Shift-In/Shift-Out codes.
2552             * The 21 unused bits may later be used for more sophisticated
2553             * state transitions.
2554             */
2555            if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
2556                byteIndex=0;
2557            } else {
2558                /* SI/SO are illegal for DBCS-only conversion */
2559                state=(uint8_t)(cnv->mode); /* restore the previous state */
2560
2561                /* callback(illegal) */
2562                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2563            }
2564        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2565            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2566                /* output BMP code point */
2567                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2568                if(offsets!=NULL) {
2569                    *offsets++=sourceIndex;
2570                }
2571                byteIndex=0;
2572            }
2573        } else if(action==MBCS_STATE_UNASSIGNED) {
2574            /* just fall through */
2575        } else if(action==MBCS_STATE_ILLEGAL) {
2576            /* callback(illegal) */
2577            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2578        } else {
2579            /* reserved, must never occur */
2580            byteIndex=0;
2581        }
2582
2583        /* end of action codes: prepare for a new character */
2584        offset=0;
2585
2586        if(byteIndex==0) {
2587            sourceIndex=nextSourceIndex;
2588        } else if(U_FAILURE(*pErrorCode)) {
2589            /* callback(illegal) */
2590            if(byteIndex>1) {
2591                /*
2592                 * Ticket 5691: consistent illegal sequences:
2593                 * - We include at least the first byte in the illegal sequence.
2594                 * - If any of the non-initial bytes could be the start of a character,
2595                 *   we stop the illegal sequence before the first one of those.
2596                 */
2597                UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
2598                int8_t i;
2599                for(i=1;
2600                    i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
2601                    ++i) {}
2602                if(i<byteIndex) {
2603                    /* Back out some bytes. */
2604                    int8_t backOutDistance=byteIndex-i;
2605                    int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
2606                    byteIndex=i;  /* length of reported illegal byte sequence */
2607                    if(backOutDistance<=bytesFromThisBuffer) {
2608                        source-=backOutDistance;
2609                    } else {
2610                        /* Back out bytes from the previous buffer: Need to replay them. */
2611                        cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
2612                        /* preToULength is negative! */
2613                        uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
2614                        source=(const uint8_t *)pArgs->source;
2615                    }
2616                }
2617            }
2618            break;
2619        } else /* unassigned sequences indicated with byteIndex>0 */ {
2620            /* try an extension mapping */
2621            pArgs->source=(const char *)source;
2622            byteIndex=_extToU(cnv, cnv->sharedData,
2623                              byteIndex, &source, sourceLimit,
2624                              &target, targetLimit,
2625                              &offsets, sourceIndex,
2626                              pArgs->flush,
2627                              pErrorCode);
2628            sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
2629
2630            if(U_FAILURE(*pErrorCode)) {
2631                /* not mappable or buffer overflow */
2632                break;
2633            }
2634        }
2635    }
2636
2637    /* set the converter state back into UConverter */
2638    cnv->toUnicodeStatus=offset;
2639    cnv->mode=state;
2640    cnv->toULength=byteIndex;
2641
2642    /* write back the updated pointers */
2643    pArgs->source=(const char *)source;
2644    pArgs->target=target;
2645    pArgs->offsets=offsets;
2646}
2647
2648/*
2649 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2650 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
2651 */
2652static UChar32
2653ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
2654                        UErrorCode *pErrorCode) {
2655    UConverter *cnv;
2656    const int32_t (*stateTable)[256];
2657    const uint8_t *source, *sourceLimit;
2658
2659    int32_t entry;
2660    uint8_t action;
2661
2662    /* set up the local pointers */
2663    cnv=pArgs->converter;
2664    source=(const uint8_t *)pArgs->source;
2665    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2666    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2667        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2668    } else {
2669        stateTable=cnv->sharedData->mbcs.stateTable;
2670    }
2671
2672    /* conversion loop */
2673    while(source<sourceLimit) {
2674        entry=stateTable[0][*source++];
2675        /* MBCS_ENTRY_IS_FINAL(entry) */
2676
2677        /* write back the updated pointer early so that we can return directly */
2678        pArgs->source=(const char *)source;
2679
2680        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2681            /* output BMP code point */
2682            return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2683        }
2684
2685        /*
2686         * An if-else-if chain provides more reliable performance for
2687         * the most common cases compared to a switch.
2688         */
2689        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2690        if( action==MBCS_STATE_VALID_DIRECT_20 ||
2691            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2692        ) {
2693            /* output supplementary code point */
2694            return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2695        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2696            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2697                /* output BMP code point */
2698                return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2699            }
2700        } else if(action==MBCS_STATE_UNASSIGNED) {
2701            /* just fall through */
2702        } else if(action==MBCS_STATE_ILLEGAL) {
2703            /* callback(illegal) */
2704            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2705        } else {
2706            /* reserved, must never occur */
2707            continue;
2708        }
2709
2710        if(U_FAILURE(*pErrorCode)) {
2711            /* callback(illegal) */
2712            break;
2713        } else /* unassigned sequence */ {
2714            /* defer to the generic implementation */
2715            pArgs->source=(const char *)source-1;
2716            return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2717        }
2718    }
2719
2720    /* no output because of empty input or only state changes */
2721    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2722    return 0xffff;
2723}
2724
2725/*
2726 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
2727 * conversion without offset handling.
2728 *
2729 * When a character does not have a mapping to Unicode, then we return to the
2730 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2731 * handling.
2732 * We also defer to the generic code in other complicated cases and have them
2733 * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
2734 *
2735 * All normal mappings and errors are handled here.
2736 */
2737static UChar32
2738ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
2739                  UErrorCode *pErrorCode) {
2740    UConverter *cnv;
2741    const uint8_t *source, *sourceLimit, *lastSource;
2742
2743    const int32_t (*stateTable)[256];
2744    const uint16_t *unicodeCodeUnits;
2745
2746    uint32_t offset;
2747    uint8_t state;
2748
2749    int32_t entry;
2750    UChar32 c;
2751    uint8_t action;
2752
2753    /* use optimized function if possible */
2754    cnv=pArgs->converter;
2755
2756    if(cnv->preToULength>0) {
2757        /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2758        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2759    }
2760
2761    if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
2762        /*
2763         * Using the generic ucnv_getNextUChar() code lets us deal correctly
2764         * with the rare case of a codepage that maps single surrogates
2765         * without adding the complexity to this already complicated function here.
2766         */
2767        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2768    } else if(cnv->sharedData->mbcs.countStates==1) {
2769        return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
2770    }
2771
2772    /* set up the local pointers */
2773    source=lastSource=(const uint8_t *)pArgs->source;
2774    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2775
2776    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2777        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2778    } else {
2779        stateTable=cnv->sharedData->mbcs.stateTable;
2780    }
2781    unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2782
2783    /* get the converter state from UConverter */
2784    offset=cnv->toUnicodeStatus;
2785
2786    /*
2787     * if we are in the SBCS state for a DBCS-only converter,
2788     * then load the DBCS state from the MBCS data
2789     * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2790     */
2791    if((state=(uint8_t)(cnv->mode))==0) {
2792        state=cnv->sharedData->mbcs.dbcsOnlyState;
2793    }
2794
2795    /* conversion loop */
2796    c=U_SENTINEL;
2797    while(source<sourceLimit) {
2798        entry=stateTable[state][*source++];
2799        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2800            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2801            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2802
2803            /* optimization for 1/2-byte input and BMP output */
2804            if( source<sourceLimit &&
2805                MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2806                MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2807                (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2808            ) {
2809                ++source;
2810                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2811                /* output BMP code point */
2812                break;
2813            }
2814        } else {
2815            /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2816            cnv->mode=state;
2817
2818            /* set the next state early so that we can reuse the entry variable */
2819            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2820
2821            /*
2822             * An if-else-if chain provides more reliable performance for
2823             * the most common cases compared to a switch.
2824             */
2825            action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2826            if(action==MBCS_STATE_VALID_DIRECT_16) {
2827                /* output BMP code point */
2828                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2829                break;
2830            } else if(action==MBCS_STATE_VALID_16) {
2831                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2832                c=unicodeCodeUnits[offset];
2833                if(c<0xfffe) {
2834                    /* output BMP code point */
2835                    break;
2836                } else if(c==0xfffe) {
2837                    if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2838                        break;
2839                    }
2840                } else {
2841                    /* callback(illegal) */
2842                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2843                }
2844            } else if(action==MBCS_STATE_VALID_16_PAIR) {
2845                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2846                c=unicodeCodeUnits[offset++];
2847                if(c<0xd800) {
2848                    /* output BMP code point below 0xd800 */
2849                    break;
2850                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2851                    /* output roundtrip or fallback supplementary code point */
2852                    c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
2853                    break;
2854                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2855                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2856                    c=unicodeCodeUnits[offset];
2857                    break;
2858                } else if(c==0xffff) {
2859                    /* callback(illegal) */
2860                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2861                }
2862            } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2863                      (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2864            ) {
2865                /* output supplementary code point */
2866                c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2867                break;
2868            } else if(action==MBCS_STATE_CHANGE_ONLY) {
2869                /*
2870                 * This serves as a state change without any output.
2871                 * It is useful for reading simple stateful encodings,
2872                 * for example using just Shift-In/Shift-Out codes.
2873                 * The 21 unused bits may later be used for more sophisticated
2874                 * state transitions.
2875                 */
2876                if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
2877                    /* SI/SO are illegal for DBCS-only conversion */
2878                    state=(uint8_t)(cnv->mode); /* restore the previous state */
2879
2880                    /* callback(illegal) */
2881                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2882                }
2883            } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2884                if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2885                    /* output BMP code point */
2886                    c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2887                    break;
2888                }
2889            } else if(action==MBCS_STATE_UNASSIGNED) {
2890                /* just fall through */
2891            } else if(action==MBCS_STATE_ILLEGAL) {
2892                /* callback(illegal) */
2893                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2894            } else {
2895                /* reserved (must never occur), or only state change */
2896                offset=0;
2897                lastSource=source;
2898                continue;
2899            }
2900
2901            /* end of action codes: prepare for a new character */
2902            offset=0;
2903
2904            if(U_FAILURE(*pErrorCode)) {
2905                /* callback(illegal) */
2906                break;
2907            } else /* unassigned sequence */ {
2908                /* defer to the generic implementation */
2909                cnv->toUnicodeStatus=0;
2910                cnv->mode=state;
2911                pArgs->source=(const char *)lastSource;
2912                return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2913            }
2914        }
2915    }
2916
2917    if(c<0) {
2918        if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
2919            /* incomplete character byte sequence */
2920            uint8_t *bytes=cnv->toUBytes;
2921            cnv->toULength=(int8_t)(source-lastSource);
2922            do {
2923                *bytes++=*lastSource++;
2924            } while(lastSource<source);
2925            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
2926        } else if(U_FAILURE(*pErrorCode)) {
2927            /* callback(illegal) */
2928            /*
2929             * Ticket 5691: consistent illegal sequences:
2930             * - We include at least the first byte in the illegal sequence.
2931             * - If any of the non-initial bytes could be the start of a character,
2932             *   we stop the illegal sequence before the first one of those.
2933             */
2934            UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
2935            uint8_t *bytes=cnv->toUBytes;
2936            *bytes++=*lastSource++;     /* first byte */
2937            if(lastSource==source) {
2938                cnv->toULength=1;
2939            } else /* lastSource<source: multi-byte character */ {
2940                int8_t i;
2941                for(i=1;
2942                    lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
2943                    ++i
2944                ) {
2945                    *bytes++=*lastSource++;
2946                }
2947                cnv->toULength=i;
2948                source=lastSource;
2949            }
2950        } else {
2951            /* no output because of empty input or only state changes */
2952            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2953        }
2954        c=0xffff;
2955    }
2956
2957    /* set the converter state back into UConverter, ready for a new character */
2958    cnv->toUnicodeStatus=0;
2959    cnv->mode=state;
2960
2961    /* write back the updated pointer */
2962    pArgs->source=(const char *)source;
2963    return c;
2964}
2965
2966#if 0
2967/*
2968 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
2969 * Removal improves code coverage.
2970 */
2971/**
2972 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
2973 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
2974 * It does not handle conversion extensions (_extToU()).
2975 */
2976U_CFUNC UChar32
2977ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
2978                              uint8_t b, UBool useFallback) {
2979    int32_t entry;
2980    uint8_t action;
2981
2982    entry=sharedData->mbcs.stateTable[0][b];
2983    /* MBCS_ENTRY_IS_FINAL(entry) */
2984
2985    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2986        /* output BMP code point */
2987        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2988    }
2989
2990    /*
2991     * An if-else-if chain provides more reliable performance for
2992     * the most common cases compared to a switch.
2993     */
2994    action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2995    if(action==MBCS_STATE_VALID_DIRECT_20) {
2996        /* output supplementary code point */
2997        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2998    } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2999        if(!TO_U_USE_FALLBACK(useFallback)) {
3000            return 0xfffe;
3001        }
3002        /* output BMP code point */
3003        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3004    } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
3005        if(!TO_U_USE_FALLBACK(useFallback)) {
3006            return 0xfffe;
3007        }
3008        /* output supplementary code point */
3009        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3010    } else if(action==MBCS_STATE_UNASSIGNED) {
3011        return 0xfffe;
3012    } else if(action==MBCS_STATE_ILLEGAL) {
3013        return 0xffff;
3014    } else {
3015        /* reserved, must never occur */
3016        return 0xffff;
3017    }
3018}
3019#endif
3020
3021/*
3022 * This is a simple version of _MBCSGetNextUChar() that is used
3023 * by other converter implementations.
3024 * It only returns an "assigned" result if it consumes the entire input.
3025 * It does not use state from the converter, nor error codes.
3026 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3027 * It handles conversion extensions but not GB 18030.
3028 *
3029 * Return value:
3030 * U+fffe   unassigned
3031 * U+ffff   illegal
3032 * otherwise the Unicode code point
3033 */
3034U_CFUNC UChar32
3035ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
3036                        const char *source, int32_t length,
3037                        UBool useFallback) {
3038    const int32_t (*stateTable)[256];
3039    const uint16_t *unicodeCodeUnits;
3040
3041    uint32_t offset;
3042    uint8_t state, action;
3043
3044    UChar32 c;
3045    int32_t i, entry;
3046
3047    if(length<=0) {
3048        /* no input at all: "illegal" */
3049        return 0xffff;
3050    }
3051
3052#if 0
3053/*
3054 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3055 * TODO In future releases, verify that this function is never called for SBCS
3056 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
3057 * Removal improves code coverage.
3058 */
3059    /* use optimized function if possible */
3060    if(sharedData->mbcs.countStates==1) {
3061        if(length==1) {
3062            return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
3063        } else {
3064            return 0xffff; /* illegal: more than a single byte for an SBCS converter */
3065        }
3066    }
3067#endif
3068
3069    /* set up the local pointers */
3070    stateTable=sharedData->mbcs.stateTable;
3071    unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
3072
3073    /* converter state */
3074    offset=0;
3075    state=sharedData->mbcs.dbcsOnlyState;
3076
3077    /* conversion loop */
3078    for(i=0;;) {
3079        entry=stateTable[state][(uint8_t)source[i++]];
3080        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
3081            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
3082            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
3083
3084            if(i==length) {
3085                return 0xffff; /* truncated character */
3086            }
3087        } else {
3088            /*
3089             * An if-else-if chain provides more reliable performance for
3090             * the most common cases compared to a switch.
3091             */
3092            action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
3093            if(action==MBCS_STATE_VALID_16) {
3094                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3095                c=unicodeCodeUnits[offset];
3096                if(c!=0xfffe) {
3097                    /* done */
3098                } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
3099                    c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
3100                /* else done with 0xfffe */
3101                }
3102                break;
3103            } else if(action==MBCS_STATE_VALID_DIRECT_16) {
3104                /* output BMP code point */
3105                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3106                break;
3107            } else if(action==MBCS_STATE_VALID_16_PAIR) {
3108                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3109                c=unicodeCodeUnits[offset++];
3110                if(c<0xd800) {
3111                    /* output BMP code point below 0xd800 */
3112                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
3113                    /* output roundtrip or fallback supplementary code point */
3114                    c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
3115                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
3116                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3117                    c=unicodeCodeUnits[offset];
3118                } else if(c==0xffff) {
3119                    return 0xffff;
3120                } else {
3121                    c=0xfffe;
3122                }
3123                break;
3124            } else if(action==MBCS_STATE_VALID_DIRECT_20) {
3125                /* output supplementary code point */
3126                c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3127                break;
3128            } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
3129                if(!TO_U_USE_FALLBACK(useFallback)) {
3130                    c=0xfffe;
3131                    break;
3132                }
3133                /* output BMP code point */
3134                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3135                break;
3136            } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
3137                if(!TO_U_USE_FALLBACK(useFallback)) {
3138                    c=0xfffe;
3139                    break;
3140                }
3141                /* output supplementary code point */
3142                c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3143                break;
3144            } else if(action==MBCS_STATE_UNASSIGNED) {
3145                c=0xfffe;
3146                break;
3147            }
3148
3149            /*
3150             * forbid MBCS_STATE_CHANGE_ONLY for this function,
3151             * and MBCS_STATE_ILLEGAL and reserved action codes
3152             */
3153            return 0xffff;
3154        }
3155    }
3156
3157    if(i!=length) {
3158        /* illegal for this function: not all input consumed */
3159        return 0xffff;
3160    }
3161
3162    if(c==0xfffe) {
3163        /* try an extension mapping */
3164        const int32_t *cx=sharedData->mbcs.extIndexes;
3165        if(cx!=NULL) {
3166            return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
3167        }
3168    }
3169
3170    return c;
3171}
3172
3173/* MBCS-from-Unicode conversion functions ----------------------------------- */
3174
3175/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
3176static void
3177ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3178                                  UErrorCode *pErrorCode) {
3179    UConverter *cnv;
3180    const UChar *source, *sourceLimit;
3181    uint8_t *target;
3182    int32_t targetCapacity;
3183    int32_t *offsets;
3184
3185    const uint16_t *table;
3186    const uint16_t *mbcsIndex;
3187    const uint8_t *bytes;
3188
3189    UChar32 c;
3190
3191    int32_t sourceIndex, nextSourceIndex;
3192
3193    uint32_t stage2Entry;
3194    uint32_t asciiRoundtrips;
3195    uint32_t value;
3196    uint8_t unicodeMask;
3197
3198    /* use optimized function if possible */
3199    cnv=pArgs->converter;
3200    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3201
3202    /* set up the local pointers */
3203    source=pArgs->source;
3204    sourceLimit=pArgs->sourceLimit;
3205    target=(uint8_t *)pArgs->target;
3206    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3207    offsets=pArgs->offsets;
3208
3209    table=cnv->sharedData->mbcs.fromUnicodeTable;
3210    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
3211    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3212        bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3213    } else {
3214        bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3215    }
3216    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3217
3218    /* get the converter state from UConverter */
3219    c=cnv->fromUChar32;
3220
3221    /* sourceIndex=-1 if the current character began in the previous buffer */
3222    sourceIndex= c==0 ? 0 : -1;
3223    nextSourceIndex=0;
3224
3225    /* conversion loop */
3226    if(c!=0 && targetCapacity>0) {
3227        goto getTrail;
3228    }
3229
3230    while(source<sourceLimit) {
3231        /*
3232         * This following test is to see if available input would overflow the output.
3233         * It does not catch output of more than one byte that
3234         * overflows as a result of a multi-byte character or callback output
3235         * from the last source character.
3236         * Therefore, those situations also test for overflows and will
3237         * then break the loop, too.
3238         */
3239        if(targetCapacity>0) {
3240            /*
3241             * Get a correct Unicode code point:
3242             * a single UChar for a BMP code point or
3243             * a matched surrogate pair for a "supplementary code point".
3244             */
3245            c=*source++;
3246            ++nextSourceIndex;
3247            if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3248                *target++=(uint8_t)c;
3249                if(offsets!=NULL) {
3250                    *offsets++=sourceIndex;
3251                    sourceIndex=nextSourceIndex;
3252                }
3253                --targetCapacity;
3254                c=0;
3255                continue;
3256            }
3257            /*
3258             * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3259             * to avoid dealing with surrogates.
3260             * MBCS_FAST_MAX must be >=0xd7ff.
3261             */
3262            if(c<=0xd7ff) {
3263                value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
3264                /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3265                if(value==0) {
3266                    goto unassigned;
3267                }
3268                /* output the value */
3269            } else {
3270                /*
3271                 * This also tests if the codepage maps single surrogates.
3272                 * If it does, then surrogates are not paired but mapped separately.
3273                 * Note that in this case unmatched surrogates are not detected.
3274                 */
3275                if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3276                    if(UTF_IS_SURROGATE_FIRST(c)) {
3277getTrail:
3278                        if(source<sourceLimit) {
3279                            /* test the following code unit */
3280                            UChar trail=*source;
3281                            if(UTF_IS_SECOND_SURROGATE(trail)) {
3282                                ++source;
3283                                ++nextSourceIndex;
3284                                c=UTF16_GET_PAIR_VALUE(c, trail);
3285                                if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3286                                    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3287                                    /* callback(unassigned) */
3288                                    goto unassigned;
3289                                }
3290                                /* convert this supplementary code point */
3291                                /* exit this condition tree */
3292                            } else {
3293                                /* this is an unmatched lead code unit (1st surrogate) */
3294                                /* callback(illegal) */
3295                                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3296                                break;
3297                            }
3298                        } else {
3299                            /* no more input */
3300                            break;
3301                        }
3302                    } else {
3303                        /* this is an unmatched trail code unit (2nd surrogate) */
3304                        /* callback(illegal) */
3305                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3306                        break;
3307                    }
3308                }
3309
3310                /* convert the Unicode code point in c into codepage bytes */
3311                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3312
3313                /* get the bytes and the length for the output */
3314                /* MBCS_OUTPUT_2 */
3315                value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3316
3317                /* is this code point assigned, or do we use fallbacks? */
3318                if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
3319                     (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
3320                ) {
3321                    /*
3322                     * We allow a 0 byte output if the "assigned" bit is set for this entry.
3323                     * There is no way with this data structure for fallback output
3324                     * to be a zero byte.
3325                     */
3326
3327unassigned:
3328                    /* try an extension mapping */
3329                    pArgs->source=source;
3330                    c=_extFromU(cnv, cnv->sharedData,
3331                                c, &source, sourceLimit,
3332                                &target, target+targetCapacity,
3333                                &offsets, sourceIndex,
3334                                pArgs->flush,
3335                                pErrorCode);
3336                    nextSourceIndex+=(int32_t)(source-pArgs->source);
3337
3338                    if(U_FAILURE(*pErrorCode)) {
3339                        /* not mappable or buffer overflow */
3340                        break;
3341                    } else {
3342                        /* a mapping was written to the target, continue */
3343
3344                        /* recalculate the targetCapacity after an extension mapping */
3345                        targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3346
3347                        /* normal end of conversion: prepare for a new character */
3348                        sourceIndex=nextSourceIndex;
3349                        continue;
3350                    }
3351                }
3352            }
3353
3354            /* write the output character bytes from value and length */
3355            /* from the first if in the loop we know that targetCapacity>0 */
3356            if(value<=0xff) {
3357                /* this is easy because we know that there is enough space */
3358                *target++=(uint8_t)value;
3359                if(offsets!=NULL) {
3360                    *offsets++=sourceIndex;
3361                }
3362                --targetCapacity;
3363            } else /* length==2 */ {
3364                *target++=(uint8_t)(value>>8);
3365                if(2<=targetCapacity) {
3366                    *target++=(uint8_t)value;
3367                    if(offsets!=NULL) {
3368                        *offsets++=sourceIndex;
3369                        *offsets++=sourceIndex;
3370                    }
3371                    targetCapacity-=2;
3372                } else {
3373                    if(offsets!=NULL) {
3374                        *offsets++=sourceIndex;
3375                    }
3376                    cnv->charErrorBuffer[0]=(char)value;
3377                    cnv->charErrorBufferLength=1;
3378
3379                    /* target overflow */
3380                    targetCapacity=0;
3381                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3382                    c=0;
3383                    break;
3384                }
3385            }
3386
3387            /* normal end of conversion: prepare for a new character */
3388            c=0;
3389            sourceIndex=nextSourceIndex;
3390            continue;
3391        } else {
3392            /* target is full */
3393            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3394            break;
3395        }
3396    }
3397
3398    /* set the converter state back into UConverter */
3399    cnv->fromUChar32=c;
3400
3401    /* write back the updated pointers */
3402    pArgs->source=source;
3403    pArgs->target=(char *)target;
3404    pArgs->offsets=offsets;
3405}
3406
3407/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
3408static void
3409ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3410                                  UErrorCode *pErrorCode) {
3411    UConverter *cnv;
3412    const UChar *source, *sourceLimit;
3413    uint8_t *target;
3414    int32_t targetCapacity;
3415    int32_t *offsets;
3416
3417    const uint16_t *table;
3418    const uint16_t *results;
3419
3420    UChar32 c;
3421
3422    int32_t sourceIndex, nextSourceIndex;
3423
3424    uint16_t value, minValue;
3425    UBool hasSupplementary;
3426
3427    /* set up the local pointers */
3428    cnv=pArgs->converter;
3429    source=pArgs->source;
3430    sourceLimit=pArgs->sourceLimit;
3431    target=(uint8_t *)pArgs->target;
3432    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3433    offsets=pArgs->offsets;
3434
3435    table=cnv->sharedData->mbcs.fromUnicodeTable;
3436    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3437        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3438    } else {
3439        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
3440    }
3441
3442    if(cnv->useFallback) {
3443        /* use all roundtrip and fallback results */
3444        minValue=0x800;
3445    } else {
3446        /* use only roundtrips and fallbacks from private-use characters */
3447        minValue=0xc00;
3448    }
3449    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
3450
3451    /* get the converter state from UConverter */
3452    c=cnv->fromUChar32;
3453
3454    /* sourceIndex=-1 if the current character began in the previous buffer */
3455    sourceIndex= c==0 ? 0 : -1;
3456    nextSourceIndex=0;
3457
3458    /* conversion loop */
3459    if(c!=0 && targetCapacity>0) {
3460        goto getTrail;
3461    }
3462
3463    while(source<sourceLimit) {
3464        /*
3465         * This following test is to see if available input would overflow the output.
3466         * It does not catch output of more than one byte that
3467         * overflows as a result of a multi-byte character or callback output
3468         * from the last source character.
3469         * Therefore, those situations also test for overflows and will
3470         * then break the loop, too.
3471         */
3472        if(targetCapacity>0) {
3473            /*
3474             * Get a correct Unicode code point:
3475             * a single UChar for a BMP code point or
3476             * a matched surrogate pair for a "supplementary code point".
3477             */
3478            c=*source++;
3479            ++nextSourceIndex;
3480            if(UTF_IS_SURROGATE(c)) {
3481                if(UTF_IS_SURROGATE_FIRST(c)) {
3482getTrail:
3483                    if(source<sourceLimit) {
3484                        /* test the following code unit */
3485                        UChar trail=*source;
3486                        if(UTF_IS_SECOND_SURROGATE(trail)) {
3487                            ++source;
3488                            ++nextSourceIndex;
3489                            c=UTF16_GET_PAIR_VALUE(c, trail);
3490                            if(!hasSupplementary) {
3491                                /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3492                                /* callback(unassigned) */
3493                                goto unassigned;
3494                            }
3495                            /* convert this supplementary code point */
3496                            /* exit this condition tree */
3497                        } else {
3498                            /* this is an unmatched lead code unit (1st surrogate) */
3499                            /* callback(illegal) */
3500                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3501                            break;
3502                        }
3503                    } else {
3504                        /* no more input */
3505                        break;
3506                    }
3507                } else {
3508                    /* this is an unmatched trail code unit (2nd surrogate) */
3509                    /* callback(illegal) */
3510                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3511                    break;
3512                }
3513            }
3514
3515            /* convert the Unicode code point in c into codepage bytes */
3516            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3517
3518            /* is this code point assigned, or do we use fallbacks? */
3519            if(value>=minValue) {
3520                /* assigned, write the output character bytes from value and length */
3521                /* length==1 */
3522                /* this is easy because we know that there is enough space */
3523                *target++=(uint8_t)value;
3524                if(offsets!=NULL) {
3525                    *offsets++=sourceIndex;
3526                }
3527                --targetCapacity;
3528
3529                /* normal end of conversion: prepare for a new character */
3530                c=0;
3531                sourceIndex=nextSourceIndex;
3532            } else { /* unassigned */
3533unassigned:
3534                /* try an extension mapping */
3535                pArgs->source=source;
3536                c=_extFromU(cnv, cnv->sharedData,
3537                            c, &source, sourceLimit,
3538                            &target, target+targetCapacity,
3539                            &offsets, sourceIndex,
3540                            pArgs->flush,
3541                            pErrorCode);
3542                nextSourceIndex+=(int32_t)(source-pArgs->source);
3543
3544                if(U_FAILURE(*pErrorCode)) {
3545                    /* not mappable or buffer overflow */
3546                    break;
3547                } else {
3548                    /* a mapping was written to the target, continue */
3549
3550                    /* recalculate the targetCapacity after an extension mapping */
3551                    targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3552
3553                    /* normal end of conversion: prepare for a new character */
3554                    sourceIndex=nextSourceIndex;
3555                }
3556            }
3557        } else {
3558            /* target is full */
3559            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3560            break;
3561        }
3562    }
3563
3564    /* set the converter state back into UConverter */
3565    cnv->fromUChar32=c;
3566
3567    /* write back the updated pointers */
3568    pArgs->source=source;
3569    pArgs->target=(char *)target;
3570    pArgs->offsets=offsets;
3571}
3572
3573/*
3574 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
3575 * that map only to and from the BMP.
3576 * In addition to single-byte/state optimizations, the offset calculations
3577 * become much easier.
3578 * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3579 * but measurements have shown that this diminishes performance
3580 * in more cases than it improves it.
3581 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3582 * for various MBCS and SBCS optimizations.
3583 */
3584static void
3585ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
3586                              UErrorCode *pErrorCode) {
3587    UConverter *cnv;
3588    const UChar *source, *sourceLimit, *lastSource;
3589    uint8_t *target;
3590    int32_t targetCapacity, length;
3591    int32_t *offsets;
3592
3593    const uint16_t *table;
3594    const uint16_t *results;
3595
3596    UChar32 c;
3597
3598    int32_t sourceIndex;
3599
3600    uint32_t asciiRoundtrips;
3601    uint16_t value, minValue;
3602
3603    /* set up the local pointers */
3604    cnv=pArgs->converter;
3605    source=pArgs->source;
3606    sourceLimit=pArgs->sourceLimit;
3607    target=(uint8_t *)pArgs->target;
3608    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3609    offsets=pArgs->offsets;
3610
3611    table=cnv->sharedData->mbcs.fromUnicodeTable;
3612    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3613        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3614    } else {
3615        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
3616    }
3617    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3618
3619    if(cnv->useFallback) {
3620        /* use all roundtrip and fallback results */
3621        minValue=0x800;
3622    } else {
3623        /* use only roundtrips and fallbacks from private-use characters */
3624        minValue=0xc00;
3625    }
3626
3627    /* get the converter state from UConverter */
3628    c=cnv->fromUChar32;
3629
3630    /* sourceIndex=-1 if the current character began in the previous buffer */
3631    sourceIndex= c==0 ? 0 : -1;
3632    lastSource=source;
3633
3634    /*
3635     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3636     * for the minimum of the sourceLength and targetCapacity
3637     */
3638    length=(int32_t)(sourceLimit-source);
3639    if(length<targetCapacity) {
3640        targetCapacity=length;
3641    }
3642
3643    /* conversion loop */
3644    if(c!=0 && targetCapacity>0) {
3645        goto getTrail;
3646    }
3647
3648#if MBCS_UNROLL_SINGLE_FROM_BMP
3649    /* unrolling makes it slower on Pentium III/Windows 2000?! */
3650    /* unroll the loop with the most common case */
3651unrolled:
3652    if(targetCapacity>=4) {
3653        int32_t count, loops;
3654        uint16_t andedValues;
3655
3656        loops=count=targetCapacity>>2;
3657        do {
3658            c=*source++;
3659            andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3660            *target++=(uint8_t)value;
3661            c=*source++;
3662            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3663            *target++=(uint8_t)value;
3664            c=*source++;
3665            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3666            *target++=(uint8_t)value;
3667            c=*source++;
3668            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3669            *target++=(uint8_t)value;
3670
3671            /* were all 4 entries really valid? */
3672            if(andedValues<minValue) {
3673                /* no, return to the first of these 4 */
3674                source-=4;
3675                target-=4;
3676                break;
3677            }
3678        } while(--count>0);
3679        count=loops-count;
3680        targetCapacity-=4*count;
3681
3682        if(offsets!=NULL) {
3683            lastSource+=4*count;
3684            while(count>0) {
3685                *offsets++=sourceIndex++;
3686                *offsets++=sourceIndex++;
3687                *offsets++=sourceIndex++;
3688                *offsets++=sourceIndex++;
3689                --count;
3690            }
3691        }
3692
3693        c=0;
3694    }
3695#endif
3696
3697    while(targetCapacity>0) {
3698        /*
3699         * Get a correct Unicode code point:
3700         * a single UChar for a BMP code point or
3701         * a matched surrogate pair for a "supplementary code point".
3702         */
3703        c=*source++;
3704        /*
3705         * Do not immediately check for single surrogates:
3706         * Assume that they are unassigned and check for them in that case.
3707         * This speeds up the conversion of assigned characters.
3708         */
3709        /* convert the Unicode code point in c into codepage bytes */
3710        if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3711            *target++=(uint8_t)c;
3712            --targetCapacity;
3713            c=0;
3714            continue;
3715        }
3716        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3717        /* is this code point assigned, or do we use fallbacks? */
3718        if(value>=minValue) {
3719            /* assigned, write the output character bytes from value and length */
3720            /* length==1 */
3721            /* this is easy because we know that there is enough space */
3722            *target++=(uint8_t)value;
3723            --targetCapacity;
3724
3725            /* normal end of conversion: prepare for a new character */
3726            c=0;
3727            continue;
3728        } else if(!UTF_IS_SURROGATE(c)) {
3729            /* normal, unassigned BMP character */
3730        } else if(UTF_IS_SURROGATE_FIRST(c)) {
3731getTrail:
3732            if(source<sourceLimit) {
3733                /* test the following code unit */
3734                UChar trail=*source;
3735                if(UTF_IS_SECOND_SURROGATE(trail)) {
3736                    ++source;
3737                    c=UTF16_GET_PAIR_VALUE(c, trail);
3738                    /* this codepage does not map supplementary code points */
3739                    /* callback(unassigned) */
3740                } else {
3741                    /* this is an unmatched lead code unit (1st surrogate) */
3742                    /* callback(illegal) */
3743                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3744                    break;
3745                }
3746            } else {
3747                /* no more input */
3748                if (pArgs->flush) {
3749                    *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3750                }
3751                break;
3752            }
3753        } else {
3754            /* this is an unmatched trail code unit (2nd surrogate) */
3755            /* callback(illegal) */
3756            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3757            break;
3758        }
3759
3760        /* c does not have a mapping */
3761
3762        /* get the number of code units for c to correctly advance sourceIndex */
3763        length=U16_LENGTH(c);
3764
3765        /* set offsets since the start or the last extension */
3766        if(offsets!=NULL) {
3767            int32_t count=(int32_t)(source-lastSource);
3768
3769            /* do not set the offset for this character */
3770            count-=length;
3771
3772            while(count>0) {
3773                *offsets++=sourceIndex++;
3774                --count;
3775            }
3776            /* offsets and sourceIndex are now set for the current character */
3777        }
3778
3779        /* try an extension mapping */
3780        lastSource=source;
3781        c=_extFromU(cnv, cnv->sharedData,
3782                    c, &source, sourceLimit,
3783                    &target, (const uint8_t *)(pArgs->targetLimit),
3784                    &offsets, sourceIndex,
3785                    pArgs->flush,
3786                    pErrorCode);
3787        sourceIndex+=length+(int32_t)(source-lastSource);
3788        lastSource=source;
3789
3790        if(U_FAILURE(*pErrorCode)) {
3791            /* not mappable or buffer overflow */
3792            break;
3793        } else {
3794            /* a mapping was written to the target, continue */
3795
3796            /* recalculate the targetCapacity after an extension mapping */
3797            targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3798            length=(int32_t)(sourceLimit-source);
3799            if(length<targetCapacity) {
3800                targetCapacity=length;
3801            }
3802        }
3803
3804#if MBCS_UNROLL_SINGLE_FROM_BMP
3805        /* unrolling makes it slower on Pentium III/Windows 2000?! */
3806        goto unrolled;
3807#endif
3808    }
3809
3810    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
3811        /* target is full */
3812        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3813    }
3814
3815    /* set offsets since the start or the last callback */
3816    if(offsets!=NULL) {
3817        size_t count=source-lastSource;
3818        if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
3819            /*
3820            Caller gave us a partial supplementary character,
3821            which this function couldn't convert in any case.
3822            The callback will handle the offset.
3823            */
3824            count--;
3825        }
3826        while(count>0) {
3827            *offsets++=sourceIndex++;
3828            --count;
3829        }
3830    }
3831
3832    /* set the converter state back into UConverter */
3833    cnv->fromUChar32=c;
3834
3835    /* write back the updated pointers */
3836    pArgs->source=source;
3837    pArgs->target=(char *)target;
3838    pArgs->offsets=offsets;
3839}
3840
/*
 * Convert UTF-16 text from pArgs->source into codepage bytes in pArgs->target
 * for an MBCS converter. If pArgs->offsets is not NULL, one source index is
 * written per output byte.
 *
 * Steps, in order:
 * 1. If cnv->preFromUFirstCP>=0, continue the pending extension match with
 *    ucnv_extContinueMatchFromU(); return if that fails or if
 *    cnv->preFromULength<0 afterwards.
 * 2. Delegate to the specialized single-byte functions (MBCS_OUTPUT_1 without
 *    UCNV_HAS_SURROGATES) or to the double-byte function (MBCS_OUTPUT_2 with
 *    utf8Friendly data), and return.
 * 3. Otherwise run the general conversion loop in this function.
 *
 * Converter state used by the general loop:
 * - cnv->fromUChar32 is read into c at entry and written from c at exit.
 *   c is 0 after a completely handled character; a non-zero value at entry
 *   is treated as a lead surrogate from the previous call.
 * - cnv->fromUnicodeStatus holds the MBCS_OUTPUT_2_SISO mode in prevLength
 *   (1=single-byte mode, 2=double-byte mode); for other output types the
 *   local prevLength starts as 0.
 *
 * Error codes set directly by this function:
 * - U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate
 *   (only when the codepage does not map single surrogates)
 * - U_BUFFER_OVERFLOW_ERROR when the target is full; bytes of the current
 *   character that do not fit are stored in cnv->charErrorBuffer
 * _extFromU() receives pErrorCode as well and may set it.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are updated
 * to the positions reached.
 */
3841U_CFUNC void
3842ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3843                            UErrorCode *pErrorCode) {
3844    UConverter *cnv;
3845    const UChar *source, *sourceLimit;
3846    uint8_t *target;
3847    int32_t targetCapacity;
3848    int32_t *offsets;
3849
3850    const uint16_t *table;
3851    const uint16_t *mbcsIndex;
3852    const uint8_t *p, *bytes;
3853    uint8_t outputType;
3854
3855    UChar32 c;
3856
        /*
         * Source indexes used for offsets:
         * start of the previous character, start of the current character,
         * and index of the next code unit to be read.
         */
3857    int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
3858
3859    uint32_t stage2Entry;
3860    uint32_t asciiRoundtrips;
3861    uint32_t value;
3862    int32_t length, prevLength;
3863    uint8_t unicodeMask;
3864
3865    cnv=pArgs->converter;
3866
3867    if(cnv->preFromUFirstCP>=0) {
3868        /*
3869         * pass sourceIndex=-1 because we continue from an earlier buffer
3870         * in the future, this may change with continuous offsets
3871         */
3872        ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
3873
            /* stop on error, or if preFromULength is negative after the continued match */
3874        if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
3875            return;
3876        }
3877    }
3878
3879    /* use optimized function if possible */
3880    outputType=cnv->sharedData->mbcs.outputType;
3881    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3882    if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3883        if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3884            ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
3885        } else {
3886            ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
3887        }
3888        return;
3889    } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
3890        ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
3891        return;
3892    }
3893
3894    /* set up the local pointers */
3895    source=pArgs->source;
3896    sourceLimit=pArgs->sourceLimit;
3897    target=(uint8_t *)pArgs->target;
        /* the loop works with a count of remaining target bytes rather than a limit pointer */
3898    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3899    offsets=pArgs->offsets;
3900
3901    table=cnv->sharedData->mbcs.fromUnicodeTable;
        /* mbcsIndex!=NULL enables the direct lookup for c<=0xd7ff in the loop below */
3902    if(cnv->sharedData->mbcs.utf8Friendly) {
3903        mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
3904    } else {
3905        mbcsIndex=NULL;
3906    }
        /* with the UCNV_OPTION_SWAP_LFNL option, use the alternate result bytes */
3907    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3908        bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3909    } else {
3910        bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3911    }
3912    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3913
3914    /* get the converter state from UConverter */
3915    c=cnv->fromUChar32;
3916
        /* prevLength tracks the SISO mode: 1=single-byte mode, 2=double-byte mode */
3917    if(outputType==MBCS_OUTPUT_2_SISO) {
3918        prevLength=cnv->fromUnicodeStatus;
3919        if(prevLength==0) {
3920            /* set the real value */
3921            prevLength=1;
3922        }
3923    } else {
3924        /* prevent fromUnicodeStatus from being set to something non-0 */
3925        prevLength=0;
3926    }
3927
3928    /* sourceIndex=-1 if the current character began in the previous buffer */
3929    prevSourceIndex=-1;
3930    sourceIndex= c==0 ? 0 : -1;
3931    nextSourceIndex=0;
3932
3933    /* conversion loop */
3934    /*
3935     * This is another piece of ugly code:
3936     * A goto into the loop if the converter state contains a first surrogate
3937     * from the previous function call.
3938     * It saves me from checking if(c==0) in each loop iteration
3939     * and from duplicating the trail-surrogate-handling code in the else
3940     * branch of that check.
3941     * I could not find any other way to get around this other than
3942     * using a function call for the conversion and callback, which would
3943     * be even more inefficient.
3944     *
3945     * Markus Scherer 2000-jul-19
3946     */
        /*
         * The jump is only taken with targetCapacity>0 because the code after
         * getTrail assumes, like the rest of the loop body, that at least
         * one byte can be written.
         */
3947    if(c!=0 && targetCapacity>0) {
3948        goto getTrail;
3949    }
3950
3951    while(source<sourceLimit) {
3952        /*
3953         * The following test is to see if available input would overflow the output.
3954         * It does not catch output of more than one byte that
3955         * overflows as a result of a multi-byte character or callback output
3956         * from the last source character.
3957         * Therefore, those situations also test for overflows and will
3958         * then break the loop, too.
3959         */
3960        if(targetCapacity>0) {
3961            /*
3962             * Get a correct Unicode code point:
3963             * a single UChar for a BMP code point or
3964             * a matched surrogate pair for a "supplementary code point".
3965             */
3966            c=*source++;
3967            ++nextSourceIndex;
                /*
                 * Fast path: a code unit <=0x7f for which IS_ASCII_ROUNDTRIP()
                 * is true is written as the single byte with the same value,
                 * without any table lookup.
                 */
3968            if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3969                *target++=(uint8_t)c;
3970                if(offsets!=NULL) {
3971                    *offsets++=sourceIndex;
3972                    prevSourceIndex=sourceIndex;
3973                    sourceIndex=nextSourceIndex;
3974                }
3975                --targetCapacity;
3976                c=0;
3977                continue;
3978            }
3979            /*
3980             * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3981             * to avoid dealing with surrogates.
3982             * MBCS_FAST_MAX must be >=0xd7ff.
3983             */
3984            if(c<=0xd7ff && mbcsIndex!=NULL) {
                    /*
                     * mbcsIndex is indexed by c>>6, that is, per block of 64 code points;
                     * the low 6 bits of c are added below to select the result in the block.
                     */
3985                value=mbcsIndex[c>>6];
3986
3987                /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
3988                /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3989                switch(outputType) {
                    /*
                     * NOTE(review): mbcsIndex!=NULL implies utf8Friendly, and
                     * MBCS_OUTPUT_2 with utf8Friendly was already delegated to
                     * ucnv_MBCSDoubleFromUnicodeWithOffsets() above, so this case
                     * looks unreachable here - confirm.
                     */
3990                case MBCS_OUTPUT_2:
3991                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
3992                    if(value<=0xff) {
3993                        if(value==0) {
3994                            goto unassigned;
3995                        } else {
3996                            length=1;
3997                        }
3998                    } else {
3999                        length=2;
4000                    }
4001                    break;
4002                case MBCS_OUTPUT_2_SISO:
4003                    /* 1/2-byte stateful with Shift-In/Shift-Out */
4004                    /*
4005                     * Save the old state in the converter object
4006                     * right here, then change the local prevLength state variable if necessary.
4007                     * Then, if this character turns out to be unassigned or a fallback that
4008                     * is not taken, the callback code must not save the new state in the converter
4009                     * because the new state is for a character that is not output.
4010                     * However, the callback must still restore the state from the converter
4011                     * in case the callback function changed it for its output.
4012                     */
4013                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
4014                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
4015                    if(value<=0xff) {
4016                        if(value==0) {
4017                            goto unassigned;
4018                        } else if(prevLength<=1) {
4019                            length=1;
4020                        } else {
4021                            /* change from double-byte mode to single-byte */
4022                            value|=(uint32_t)UCNV_SI<<8;
4023                            length=2;
4024                            prevLength=1;
4025                        }
4026                    } else {
4027                        if(prevLength==2) {
4028                            length=2;
4029                        } else {
4030                            /* change from single-byte mode to double-byte */
4031                            value|=(uint32_t)UCNV_SO<<16;
4032                            length=3;
4033                            prevLength=2;
4034                        }
4035                    }
4036                    break;
4037                case MBCS_OUTPUT_DBCS_ONLY:
4038                    /* table with single-byte results, but only DBCS mappings used */
4039                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
4040                    if(value<=0xff) {
4041                        /* no mapping or SBCS result, not taken for DBCS-only */
4042                        goto unassigned;
4043                    } else {
4044                        length=2;
4045                    }
4046                    break;
4047                case MBCS_OUTPUT_3:
                        /* 3 bytes per result, assembled with the first byte most significant */
4048                    p=bytes+(value+(c&0x3f))*3;
4049                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4050                    if(value<=0xff) {
4051                        if(value==0) {
4052                            goto unassigned;
4053                        } else {
4054                            length=1;
4055                        }
4056                    } else if(value<=0xffff) {
4057                        length=2;
4058                    } else {
4059                        length=3;
4060                    }
4061                    break;
4062                case MBCS_OUTPUT_4:
4063                    value=((const uint32_t *)bytes)[value +(c&0x3f)];
4064                    if(value<=0xff) {
4065                        if(value==0) {
4066                            goto unassigned;
4067                        } else {
4068                            length=1;
4069                        }
4070                    } else if(value<=0xffff) {
4071                        length=2;
4072                    } else if(value<=0xffffff) {
4073                        length=3;
4074                    } else {
4075                        length=4;
4076                    }
4077                    break;
4078                case MBCS_OUTPUT_3_EUC:
4079                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
4080                    /* EUC 16-bit fixed-length representation */
                        /*
                         * For two-byte stored values: a clear bit 15 selects a 3-byte
                         * result with lead byte 0x8e, a clear bit 7 selects a 3-byte
                         * result with lead byte 0x8f; the cleared bit is set again by the OR.
                         */
4081                    if(value<=0xff) {
4082                        if(value==0) {
4083                            goto unassigned;
4084                        } else {
4085                            length=1;
4086                        }
4087                    } else if((value&0x8000)==0) {
4088                        value|=0x8e8000;
4089                        length=3;
4090                    } else if((value&0x80)==0) {
4091                        value|=0x8f0080;
4092                        length=3;
4093                    } else {
4094                        length=2;
4095                    }
4096                    break;
4097                case MBCS_OUTPUT_4_EUC:
4098                    p=bytes+(value+(c&0x3f))*3;
4099                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4100                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
4101                    if(value<=0xff) {
4102                        if(value==0) {
4103                            goto unassigned;
4104                        } else {
4105                            length=1;
4106                        }
4107                    } else if(value<=0xffff) {
4108                        length=2;
4109                    } else if((value&0x800000)==0) {
4110                        value|=0x8e800000;
4111                        length=4;
4112                    } else if((value&0x8000)==0) {
4113                        value|=0x8f008000;
4114                        length=4;
4115                    } else {
4116                        length=3;
4117                    }
4118                    break;
4119                default:
4120                    /* must not occur */
4121                    /*
4122                     * To avoid compiler warnings that value & length may be
4123                     * used without having been initialized, we set them here.
4124                     * In reality, this is unreachable code.
4125                     * Not having a default branch also causes warnings with
4126                     * some compilers.
4127                     */
4128                    value=0;
4129                    length=0;
4130                    break;
4131                }
4132                /* output the value */
4133            } else {
4134                /*
4135                 * This also tests if the codepage maps single surrogates.
4136                 * If it does, then surrogates are not paired but mapped separately.
4137                 * Note that in this case unmatched surrogates are not detected.
4138                 */
4139                if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4140                    if(UTF_IS_SURROGATE_FIRST(c)) {
                            /*
                             * getTrail is also the target of the goto before the loop;
                             * in that case c is the value restored from cnv->fromUChar32.
                             */
4141getTrail:
4142                        if(source<sourceLimit) {
4143                            /* test the following code unit */
4144                            UChar trail=*source;
4145                            if(UTF_IS_SECOND_SURROGATE(trail)) {
4146                                ++source;
4147                                ++nextSourceIndex;
4148                                c=UTF16_GET_PAIR_VALUE(c, trail);
4149                                if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4150                                    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4151                                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
4152                                    /* callback(unassigned) */
4153                                    goto unassigned;
4154                                }
4155                                /* convert this supplementary code point */
4156                                /* exit this condition tree */
4157                            } else {
4158                                /* this is an unmatched lead code unit (1st surrogate) */
4159                                /* callback(illegal) */
4160                                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4161                                break;
4162                            }
4163                        } else {
4164                            /* no more input */
                                /* c still holds the lead surrogate; it is stored in cnv->fromUChar32 after the loop */
4165                            break;
4166                        }
4167                    } else {
4168                        /* this is an unmatched trail code unit (2nd surrogate) */
4169                        /* callback(illegal) */
4170                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4171                        break;
4172                    }
4173                }
4174
4175                /* convert the Unicode code point in c into codepage bytes */
4176
4177                /*
4178                 * The basic lookup is a triple-stage compact array (trie) lookup.
4179                 * For details see the beginning of this file.
4180                 *
4181                 * Single-byte codepages are handled with a different data structure
4182                 * by _MBCSSingle... functions.
4183                 *
4184                 * The result consists of a 32-bit value from stage 2 and
4185                 * a pointer to as many bytes as are stored per character.
4186                 * The pointer points to the character's bytes in stage 3.
4187                 * Bits 15..0 of the stage 2 entry contain the stage 3 index
4188                 * for that pointer, while bits 31..16 are flags for which of
4189                 * the 16 characters in the block are roundtrip-assigned.
4190                 *
4191                 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
4192                 * respectively as uint32_t, in the platform encoding.
4193                 * For 3-byte codepages, the bytes are always stored in big-endian order.
4194                 *
4195                 * For EUC encodings that use only either 0x8e or 0x8f as the first
4196                 * byte of their longest byte sequences, the first two bytes in
4197                 * this third stage indicate with their 7th bits whether these bytes
4198                 * are to be written directly or actually need to be preceded by
4199                 * one of the two Single-Shift codes. With this, the third stage
4200                 * stores one byte fewer per character than the actual maximum length of
4201                 * EUC byte sequences.
4202                 *
4203                 * Other than that, leading zero bytes are removed and the other
4204                 * bytes output. A single zero byte may be output if the "assigned"
4205                 * bit in stage 2 was on.
4206                 * The data structure does not support zero byte output as a fallback,
4207                 * and also does not allow output of leading zeros.
4208                 */
4209                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4210
4211                /* get the bytes and the length for the output */
                    /*
                     * Unlike the utf8Friendly switch above, most cases here do not
                     * test for value==0: whether c is assigned is decided after
                     * the switch from the roundtrip flag and the fallback setting.
                     */
4212                switch(outputType) {
4213                case MBCS_OUTPUT_2:
4214                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4215                    if(value<=0xff) {
4216                        length=1;
4217                    } else {
4218                        length=2;
4219                    }
4220                    break;
4221                case MBCS_OUTPUT_2_SISO:
4222                    /* 1/2-byte stateful with Shift-In/Shift-Out */
4223                    /*
4224                     * Save the old state in the converter object
4225                     * right here, then change the local prevLength state variable if necessary.
4226                     * Then, if this character turns out to be unassigned or a fallback that
4227                     * is not taken, the callback code must not save the new state in the converter
4228                     * because the new state is for a character that is not output.
4229                     * However, the callback must still restore the state from the converter
4230                     * in case the callback function changed it for its output.
4231                     */
4232                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
4233                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4234                    if(value<=0xff) {
4235                        if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
4236                            /* no mapping, leave value==0 */
4237                            length=0;
4238                        } else if(prevLength<=1) {
4239                            length=1;
4240                        } else {
4241                            /* change from double-byte mode to single-byte */
4242                            value|=(uint32_t)UCNV_SI<<8;
4243                            length=2;
4244                            prevLength=1;
4245                        }
4246                    } else {
4247                        if(prevLength==2) {
4248                            length=2;
4249                        } else {
4250                            /* change from single-byte mode to double-byte */
4251                            value|=(uint32_t)UCNV_SO<<16;
4252                            length=3;
4253                            prevLength=2;
4254                        }
4255                    }
4256                    break;
4257                case MBCS_OUTPUT_DBCS_ONLY:
4258                    /* table with single-byte results, but only DBCS mappings used */
4259                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4260                    if(value<=0xff) {
4261                        /* no mapping or SBCS result, not taken for DBCS-only */
4262                        value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4263                        length=0;
4264                    } else {
4265                        length=2;
4266                    }
4267                    break;
4268                case MBCS_OUTPUT_3:
4269                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4270                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4271                    if(value<=0xff) {
4272                        length=1;
4273                    } else if(value<=0xffff) {
4274                        length=2;
4275                    } else {
4276                        length=3;
4277                    }
4278                    break;
4279                case MBCS_OUTPUT_4:
4280                    value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
4281                    if(value<=0xff) {
4282                        length=1;
4283                    } else if(value<=0xffff) {
4284                        length=2;
4285                    } else if(value<=0xffffff) {
4286                        length=3;
4287                    } else {
4288                        length=4;
4289                    }
4290                    break;
4291                case MBCS_OUTPUT_3_EUC:
4292                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4293                    /* EUC 16-bit fixed-length representation */
4294                    if(value<=0xff) {
4295                        length=1;
4296                    } else if((value&0x8000)==0) {
4297                        value|=0x8e8000;
4298                        length=3;
4299                    } else if((value&0x80)==0) {
4300                        value|=0x8f0080;
4301                        length=3;
4302                    } else {
4303                        length=2;
4304                    }
4305                    break;
4306                case MBCS_OUTPUT_4_EUC:
4307                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4308                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4309                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
4310                    if(value<=0xff) {
4311                        length=1;
4312                    } else if(value<=0xffff) {
4313                        length=2;
4314                    } else if((value&0x800000)==0) {
4315                        value|=0x8e800000;
4316                        length=4;
4317                    } else if((value&0x8000)==0) {
4318                        value|=0x8f008000;
4319                        length=4;
4320                    } else {
4321                        length=3;
4322                    }
4323                    break;
4324                default:
4325                    /* must not occur */
4326                    /*
4327                     * To avoid compiler warnings that value & length may be
4328                     * used without having been initialized, we set them here.
4329                     * In reality, this is unreachable code.
4330                     * Not having a default branch also causes warnings with
4331                     * some compilers.
4332                     */
4333                    value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4334                    length=0;
4335                    break;
4336                }
4337
4338                /* is this code point assigned, or do we use fallbacks? */
                    /*
                     * The table result is used if the roundtrip flag for c is set in
                     * stage2Entry, or if fallbacks are enabled for c and value!=0.
                     * Otherwise fall into the extension lookup below.
                     */
4339                if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
4340                     (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
4341                ) {
4342                    /*
4343                     * We allow a 0 byte output if the "assigned" bit is set for this entry.
4344                     * There is no way with this data structure for fallback output
4345                     * to be a zero byte.
4346                     */
4347
4348unassigned:
4349                    /* try an extension mapping */
                        /* remember the current source position so that the units consumed by _extFromU() can be counted */
4350                    pArgs->source=source;
4351                    c=_extFromU(cnv, cnv->sharedData,
4352                                c, &source, sourceLimit,
4353                                &target, target+targetCapacity,
4354                                &offsets, sourceIndex,
4355                                pArgs->flush,
4356                                pErrorCode);
                        /* source may have been advanced by the call; keep nextSourceIndex in step */
4357                    nextSourceIndex+=(int32_t)(source-pArgs->source);
4358                    prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
4359
4360                    if(U_FAILURE(*pErrorCode)) {
4361                        /* not mappable or buffer overflow */
4362                        break;
4363                    } else {
4364                        /* a mapping was written to the target, continue */
4365
4366                        /* recalculate the targetCapacity after an extension mapping */
4367                        targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4368
4369                        /* normal end of conversion: prepare for a new character */
4370                        if(offsets!=NULL) {
4371                            prevSourceIndex=sourceIndex;
4372                            sourceIndex=nextSourceIndex;
4373                        }
4374                        continue;
4375                    }
4376                }
4377            }
4378
4379            /* write the output character bytes from value and length */
4380            /* from the first if in the loop we know that targetCapacity>0 */
                /* value holds the bytes with the last byte in bits 7..0; the most significant of the length bytes is written first */
4381            if(length<=targetCapacity) {
4382                if(offsets==NULL) {
4383                    switch(length) {
4384                        /* each branch falls through to the next one */
4385                    case 4:
4386                        *target++=(uint8_t)(value>>24);
4387                    case 3:
4388                        *target++=(uint8_t)(value>>16);
4389                    case 2:
4390                        *target++=(uint8_t)(value>>8);
4391                    case 1:
4392                        *target++=(uint8_t)value;
4393                    default:
4394                        /* will never occur */
4395                        break;
4396                    }
4397                } else {
                        /* same as above, plus one offset (the current character's source index) per byte */
4398                    switch(length) {
4399                        /* each branch falls through to the next one */
4400                    case 4:
4401                        *target++=(uint8_t)(value>>24);
4402                        *offsets++=sourceIndex;
4403                    case 3:
4404                        *target++=(uint8_t)(value>>16);
4405                        *offsets++=sourceIndex;
4406                    case 2:
4407                        *target++=(uint8_t)(value>>8);
4408                        *offsets++=sourceIndex;
4409                    case 1:
4410                        *target++=(uint8_t)value;
4411                        *offsets++=sourceIndex;
4412                    default:
4413                        /* will never occur */
4414                        break;
4415                    }
4416                }
4417                targetCapacity-=length;
4418            } else {
4419                uint8_t *charErrorBuffer;
4420
4421                /*
4422                 * We actually do this backwards here:
4423                 * In order to save an intermediate variable, we output
4424                 * first to the overflow buffer what does not fit into the
4425                 * regular target.
4426                 */
4427                /* we know that 1<=targetCapacity<length<=4 */
                    /* from here on, length is the number of bytes that go into charErrorBuffer */
4428                length-=targetCapacity;
4429                charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
4430                switch(length) {
4431                    /* each branch falls through to the next one */
4432                case 3:
4433                    *charErrorBuffer++=(uint8_t)(value>>16);
4434                case 2:
4435                    *charErrorBuffer++=(uint8_t)(value>>8);
4436                case 1:
4437                    *charErrorBuffer=(uint8_t)value;
4438                default:
4439                    /* will never occur */
4440                    break;
4441                }
4442                cnv->charErrorBufferLength=(int8_t)length;
4443
4444                /* now output what fits into the regular target */
                    /* shift out the low bytes that were just stored so that the leading bytes are in the low bits */
4445                value>>=8*length; /* length was reduced by targetCapacity */
4446                switch(targetCapacity) {
4447                    /* each branch falls through to the next one */
4448                case 3:
4449                    *target++=(uint8_t)(value>>16);
4450                    if(offsets!=NULL) {
4451                        *offsets++=sourceIndex;
4452                    }
4453                case 2:
4454                    *target++=(uint8_t)(value>>8);
4455                    if(offsets!=NULL) {
4456                        *offsets++=sourceIndex;
4457                    }
4458                case 1:
4459                    *target++=(uint8_t)value;
4460                    if(offsets!=NULL) {
4461                        *offsets++=sourceIndex;
4462                    }
4463                default:
4464                    /* will never occur */
4465                    break;
4466                }
4467
4468                /* target overflow */
                    /* c is reset because the character was completely converted (partly into charErrorBuffer) */
4469                targetCapacity=0;
4470                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4471                c=0;
4472                break;
4473            }
4474
4475            /* normal end of conversion: prepare for a new character */
4476            c=0;
4477            if(offsets!=NULL) {
4478                prevSourceIndex=sourceIndex;
4479                sourceIndex=nextSourceIndex;
4480            }
4481            continue;
4482        } else {
4483            /* target is full */
4484            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4485            break;
4486        }
4487    }
4488
4489    /*
4490     * the end of the input stream and detection of truncated input
4491     * are handled by the framework, but for EBCDIC_STATEFUL conversion
4492     * we need to emit an SI at the very end
4493     *
4494     * conditions:
4495     *   successful
4496     *   EBCDIC_STATEFUL in DBCS mode
4497     *   end of input and no truncated input
4498     */
4499    if( U_SUCCESS(*pErrorCode) &&
4500        outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
4501        pArgs->flush && source>=sourceLimit && c==0
4502    ) {
4503        /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
4504        if(targetCapacity>0) {
                /* targetCapacity is not decremented here; it is not used after this point */
4505            *target++=(uint8_t)UCNV_SI;
4506            if(offsets!=NULL) {
4507                /* set the last source character's index (sourceIndex points at sourceLimit now) */
4508                *offsets++=prevSourceIndex;
4509            }
4510        } else {
4511            /* target is full */
                /* the SI goes into the overflow buffer instead */
4512            cnv->charErrorBuffer[0]=(char)UCNV_SI;
4513            cnv->charErrorBufferLength=1;
4514            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4515        }
4516        prevLength=1; /* we switched into SBCS */
4517    }
4518
4519    /* set the converter state back into UConverter */
4520    cnv->fromUChar32=c;
4521    cnv->fromUnicodeStatus=prevLength;
4522
4523    /* write back the updated pointers */
4524    pArgs->source=source;
4525    pArgs->target=(char *)target;
4526    pArgs->offsets=offsets;
4527}
4528
4529/*
4530 * This is another simple conversion function for internal use by other
4531 * conversion implementations.
4532 * It does not use the converter state nor call callbacks.
4533 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
4534 * It handles conversion extensions but not GB 18030.
4535 *
4536 * It converts one single Unicode code point into codepage bytes, encoded
4537 * as one 32-bit value. The function returns the number of bytes in *pValue:
4538 * 1..4 the number of bytes in *pValue
4539 * 0    unassigned (*pValue undefined)
4540 * -1   illegal (currently not used, *pValue undefined)
4541 *
4542 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
4543 * the second to last byte in bits 15..8, etc.
4544 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
4545 */
4546U_CFUNC int32_t
4547ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
4548                 UChar32 c, uint32_t *pValue,
4549                 UBool useFallback) {
4550    const int32_t *cx;
4551    const uint16_t *table;
4552#if 0
4553/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4554    const uint8_t *p;
4555#endif
4556    uint32_t stage2Entry;
4557    uint32_t value;
4558    int32_t length;
4559
4560    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4561    if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4562        table=sharedData->mbcs.fromUnicodeTable;
4563
4564        /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
4565        if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
4566            value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
4567            /* is this code point assigned, or do we use fallbacks? */
4568            if(useFallback ? value>=0x800 : value>=0xc00) {
4569                *pValue=value&0xff;
4570                return 1;
4571            }
4572        } else /* outputType!=MBCS_OUTPUT_1 */ {
4573            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4574
4575            /* get the bytes and the length for the output */
4576            switch(sharedData->mbcs.outputType) {
4577            case MBCS_OUTPUT_2:
4578                value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4579                if(value<=0xff) {
4580                    length=1;
4581                } else {
4582                    length=2;
4583                }
4584                break;
4585#if 0
4586/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4587            case MBCS_OUTPUT_DBCS_ONLY:
4588                /* table with single-byte results, but only DBCS mappings used */
4589                value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4590                if(value<=0xff) {
4591                    /* no mapping or SBCS result, not taken for DBCS-only */
4592                    value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4593                    length=0;
4594                } else {
4595                    length=2;
4596                }
4597                break;
4598            case MBCS_OUTPUT_3:
4599                p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4600                value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4601                if(value<=0xff) {
4602                    length=1;
4603                } else if(value<=0xffff) {
4604                    length=2;
4605                } else {
4606                    length=3;
4607                }
4608                break;
4609            case MBCS_OUTPUT_4:
4610                value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4611                if(value<=0xff) {
4612                    length=1;
4613                } else if(value<=0xffff) {
4614                    length=2;
4615                } else if(value<=0xffffff) {
4616                    length=3;
4617                } else {
4618                    length=4;
4619                }
4620                break;
4621            case MBCS_OUTPUT_3_EUC:
4622                value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4623                /* EUC 16-bit fixed-length representation */
4624                if(value<=0xff) {
4625                    length=1;
4626                } else if((value&0x8000)==0) {
4627                    value|=0x8e8000;
4628                    length=3;
4629                } else if((value&0x80)==0) {
4630                    value|=0x8f0080;
4631                    length=3;
4632                } else {
4633                    length=2;
4634                }
4635                break;
4636            case MBCS_OUTPUT_4_EUC:
4637                p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4638                value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4639                /* EUC 16-bit fixed-length representation applied to the first two bytes */
4640                if(value<=0xff) {
4641                    length=1;
4642                } else if(value<=0xffff) {
4643                    length=2;
4644                } else if((value&0x800000)==0) {
4645                    value|=0x8e800000;
4646                    length=4;
4647                } else if((value&0x8000)==0) {
4648                    value|=0x8f008000;
4649                    length=4;
4650                } else {
4651                    length=3;
4652                }
4653                break;
4654#endif
4655            default:
4656                /* must not occur */
4657                return -1;
4658            }
4659
4660            /* is this code point assigned, or do we use fallbacks? */
4661            if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
4662                (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
4663            ) {
4664                /*
4665                 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4666                 * There is no way with this data structure for fallback output
4667                 * to be a zero byte.
4668                 */
4669                /* assigned */
4670                *pValue=value;
4671                return length;
4672            }
4673        }
4674    }
4675
4676    cx=sharedData->mbcs.extIndexes;
4677    if(cx!=NULL) {
4678        length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
4679        return length>=0 ? length : -length;  /* return abs(length); */
4680    }
4681
4682    /* unassigned */
4683    return 0;
4684}
4685
4686
#if 0
/*
 * This function has been moved to ucnv2022.c for inlining.
 * The copy here is disabled and kept only for documentation purposes.
 */

/**
 * Variant of ucnv_MBCSFromUChar32() that is optimized for single-byte codepages.
 * The EBCDIC swaplfnl option (set in UConverter) is not handled.
 * Conversion extensions (_extFromU()) are not handled.
 *
 * Returns the codepage byte for the code point, or -1 if it is unassigned.
 */
U_CFUNC int32_t
ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                       UChar32 c,
                       UBool useFallback) {
    int32_t result, threshold;

    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        return -1;
    }

    /* look up the result word for c (same as in _MBCSFromUnicodeWithOffsets) */
    result=MBCS_SINGLE_RESULT_FROM_U(
                sharedData->mbcs.fromUnicodeTable,
                (uint16_t *)sharedData->mbcs.fromUnicodeBytes,
                c);

    /* is this code point assigned, or do we use fallbacks? */
    threshold= useFallback ? 0x800 : 0xc00;
    if(result<threshold) {
        return -1;
    }
    return result&0xff;
}
#endif
4725
4726/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
4727
4728/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
4729static const UChar32
4730utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
4731
4732/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
4733static const UChar32
4734utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
4735
4736static void
4737ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
4738                  UConverterToUnicodeArgs *pToUArgs,
4739                  UErrorCode *pErrorCode) {
4740    UConverter *utf8, *cnv;
4741    const uint8_t *source, *sourceLimit;
4742    uint8_t *target;
4743    int32_t targetCapacity;
4744
4745    const uint16_t *table, *sbcsIndex;
4746    const uint16_t *results;
4747
4748    int8_t oldToULength, toULength, toULimit;
4749
4750    UChar32 c;
4751    uint8_t b, t1, t2;
4752
4753    uint32_t asciiRoundtrips;
4754    uint16_t value, minValue;
4755    UBool hasSupplementary;
4756
4757    /* set up the local pointers */
4758    utf8=pToUArgs->converter;
4759    cnv=pFromUArgs->converter;
4760    source=(uint8_t *)pToUArgs->source;
4761    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
4762    target=(uint8_t *)pFromUArgs->target;
4763    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
4764
4765    table=cnv->sharedData->mbcs.fromUnicodeTable;
4766    sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
4767    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
4768        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
4769    } else {
4770        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
4771    }
4772    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
4773
4774    if(cnv->useFallback) {
4775        /* use all roundtrip and fallback results */
4776        minValue=0x800;
4777    } else {
4778        /* use only roundtrips and fallbacks from private-use characters */
4779        minValue=0xc00;
4780    }
4781    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
4782
4783    /* get the converter state from the UTF-8 UConverter */
4784    c=(UChar32)utf8->toUnicodeStatus;
4785    if(c!=0) {
4786        toULength=oldToULength=utf8->toULength;
4787        toULimit=(int8_t)utf8->mode;
4788    } else {
4789        toULength=oldToULength=toULimit=0;
4790    }
4791
4792    /*
4793     * Make sure that the last byte sequence before sourceLimit is complete
4794     * or runs into a lead byte.
4795     * Do not go back into the bytes that will be read for finishing a partial
4796     * sequence from the previous buffer.
4797     * In the conversion loop compare source with sourceLimit only once
4798     * per multi-byte character.
4799     */
4800    {
4801        int32_t i, length;
4802
4803        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
4804        for(i=0; i<3 && i<length;) {
4805            b=*(sourceLimit-i-1);
4806            if(U8_IS_TRAIL(b)) {
4807                ++i;
4808            } else {
4809                if(i<utf8_countTrailBytes[b]) {
4810                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
4811                    sourceLimit-=i+1;
4812                }
4813                break;
4814            }
4815        }
4816    }
4817
4818    if(c!=0 && targetCapacity>0) {
4819        utf8->toUnicodeStatus=0;
4820        utf8->toULength=0;
4821        goto moreBytes;
4822        /*
4823         * Note: We could avoid the goto by duplicating some of the moreBytes
4824         * code, but only up to the point of collecting a complete UTF-8
4825         * sequence; then recurse for the toUBytes[toULength]
4826         * and then continue with normal conversion.
4827         *
4828         * If so, move this code to just after initializing the minimum
4829         * set of local variables for reading the UTF-8 input
4830         * (utf8, source, target, limits but not cnv, table, minValue, etc.).
4831         *
4832         * Potential advantages:
4833         * - avoid the goto
4834         * - oldToULength could become a local variable in just those code blocks
4835         *   that deal with buffer boundaries
4836         * - possibly faster if the goto prevents some compiler optimizations
4837         *   (this would need measuring to confirm)
4838         * Disadvantage:
4839         * - code duplication
4840         */
4841    }
4842
4843    /* conversion loop */
4844    while(source<sourceLimit) {
4845        if(targetCapacity>0) {
4846            b=*source++;
4847            if((int8_t)b>=0) {
4848                /* convert ASCII */
4849                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
4850                    *target++=(uint8_t)b;
4851                    --targetCapacity;
4852                    continue;
4853                } else {
4854                    c=b;
4855                    value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
4856                }
4857            } else {
4858                if(b<0xe0) {
4859                    if( /* handle U+0080..U+07FF inline */
4860                        b>=0xc2 &&
4861                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
4862                    ) {
4863                        c=b&0x1f;
4864                        ++source;
4865                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
4866                        if(value>=minValue) {
4867                            *target++=(uint8_t)value;
4868                            --targetCapacity;
4869                            continue;
4870                        } else {
4871                            c=(c<<6)|t1;
4872                        }
4873                    } else {
4874                        c=-1;
4875                    }
4876                } else if(b==0xe0) {
4877                    if( /* handle U+0800..U+0FFF inline */
4878                        (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
4879                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
4880                    ) {
4881                        c=t1;
4882                        source+=2;
4883                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
4884                        if(value>=minValue) {
4885                            *target++=(uint8_t)value;
4886                            --targetCapacity;
4887                            continue;
4888                        } else {
4889                            c=(c<<6)|t2;
4890                        }
4891                    } else {
4892                        c=-1;
4893                    }
4894                } else {
4895                    c=-1;
4896                }
4897
4898                if(c<0) {
4899                    /* handle "complicated" and error cases, and continuing partial characters */
4900                    oldToULength=0;
4901                    toULength=1;
4902                    toULimit=utf8_countTrailBytes[b]+1;
4903                    c=b;
4904moreBytes:
4905                    while(toULength<toULimit) {
4906                        if(source<sourceLimit) {
4907                            b=*source;
4908                            if(U8_IS_TRAIL(b)) {
4909                                ++source;
4910                                ++toULength;
4911                                c=(c<<6)+b;
4912                            } else {
4913                                break; /* sequence too short, stop with toULength<toULimit */
4914                            }
4915                        } else {
4916                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
4917                            source-=(toULength-oldToULength);
4918                            while(oldToULength<toULength) {
4919                                utf8->toUBytes[oldToULength++]=*source++;
4920                            }
4921                            utf8->toUnicodeStatus=c;
4922                            utf8->toULength=toULength;
4923                            utf8->mode=toULimit;
4924                            pToUArgs->source=(char *)source;
4925                            pFromUArgs->target=(char *)target;
4926                            return;
4927                        }
4928                    }
4929
4930                    if( toULength==toULimit &&      /* consumed all trail bytes */
4931                        (toULength==3 || toULength==2) &&             /* BMP */
4932                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
4933                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
4934                    ) {
4935                        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4936                    } else if(
4937                        toULength==toULimit && toULength==4 &&
4938                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
4939                    ) {
4940                        /* supplementary code point */
4941                        if(!hasSupplementary) {
4942                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4943                            value=0;
4944                        } else {
4945                            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4946                        }
4947                    } else {
4948                        /* error handling: illegal UTF-8 byte sequence */
4949                        source-=(toULength-oldToULength);
4950                        while(oldToULength<toULength) {
4951                            utf8->toUBytes[oldToULength++]=*source++;
4952                        }
4953                        utf8->toULength=toULength;
4954                        pToUArgs->source=(char *)source;
4955                        pFromUArgs->target=(char *)target;
4956                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4957                        return;
4958                    }
4959                }
4960            }
4961
4962            if(value>=minValue) {
4963                /* output the mapping for c */
4964                *target++=(uint8_t)value;
4965                --targetCapacity;
4966            } else {
4967                /* value<minValue means c is unassigned (unmappable) */
4968                /*
4969                 * Try an extension mapping.
4970                 * Pass in no source because we don't have UTF-16 input.
4971                 * If we have a partial match on c, we will return and revert
4972                 * to UTF-8->UTF-16->charset conversion.
4973                 */
4974                static const UChar nul=0;
4975                const UChar *noSource=&nul;
4976                c=_extFromU(cnv, cnv->sharedData,
4977                            c, &noSource, noSource,
4978                            &target, target+targetCapacity,
4979                            NULL, -1,
4980                            pFromUArgs->flush,
4981                            pErrorCode);
4982
4983                if(U_FAILURE(*pErrorCode)) {
4984                    /* not mappable or buffer overflow */
4985                    cnv->fromUChar32=c;
4986                    break;
4987                } else if(cnv->preFromUFirstCP>=0) {
4988                    /*
4989                     * Partial match, return and revert to pivoting.
4990                     * In normal from-UTF-16 conversion, we would just continue
4991                     * but then exit the loop because the extension match would
4992                     * have consumed the source.
4993                     */
4994                    break;
4995                } else {
4996                    /* a mapping was written to the target, continue */
4997
4998                    /* recalculate the targetCapacity after an extension mapping */
4999                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5000                }
5001            }
5002        } else {
5003            /* target is full */
5004            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5005            break;
5006        }
5007    }
5008
5009    /*
5010     * The sourceLimit may have been adjusted before the conversion loop
5011     * to stop before a truncated sequence.
5012     * If so, then collect the truncated sequence now.
5013     */
5014    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5015        c=utf8->toUBytes[0]=b=*source++;
5016        toULength=1;
5017        toULimit=utf8_countTrailBytes[b]+1;
5018        while(source<sourceLimit) {
5019            utf8->toUBytes[toULength++]=b=*source++;
5020            c=(c<<6)+b;
5021        }
5022        utf8->toUnicodeStatus=c;
5023        utf8->toULength=toULength;
5024        utf8->mode=toULimit;
5025    }
5026
5027    /* write back the updated pointers */
5028    pToUArgs->source=(char *)source;
5029    pFromUArgs->target=(char *)target;
5030}
5031
5032static void
5033ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
5034                  UConverterToUnicodeArgs *pToUArgs,
5035                  UErrorCode *pErrorCode) {
5036    UConverter *utf8, *cnv;
5037    const uint8_t *source, *sourceLimit;
5038    uint8_t *target;
5039    int32_t targetCapacity;
5040
5041    const uint16_t *table, *mbcsIndex;
5042    const uint16_t *results;
5043
5044    int8_t oldToULength, toULength, toULimit;
5045
5046    UChar32 c;
5047    uint8_t b, t1, t2;
5048
5049    uint32_t stage2Entry;
5050    uint32_t asciiRoundtrips;
5051    uint16_t value, minValue;
5052    UBool hasSupplementary;
5053
5054    /* set up the local pointers */
5055    utf8=pToUArgs->converter;
5056    cnv=pFromUArgs->converter;
5057    source=(uint8_t *)pToUArgs->source;
5058    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
5059    target=(uint8_t *)pFromUArgs->target;
5060    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
5061
5062    table=cnv->sharedData->mbcs.fromUnicodeTable;
5063    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
5064    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
5065        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
5066    } else {
5067        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
5068    }
5069    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
5070
5071    if(cnv->useFallback) {
5072        /* use all roundtrip and fallback results */
5073        minValue=0x800;
5074    } else {
5075        /* use only roundtrips and fallbacks from private-use characters */
5076        minValue=0xc00;
5077    }
5078    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
5079
5080    /* get the converter state from the UTF-8 UConverter */
5081    c=(UChar32)utf8->toUnicodeStatus;
5082    if(c!=0) {
5083        toULength=oldToULength=utf8->toULength;
5084        toULimit=(int8_t)utf8->mode;
5085    } else {
5086        toULength=oldToULength=toULimit=0;
5087    }
5088
5089    /*
5090     * Make sure that the last byte sequence before sourceLimit is complete
5091     * or runs into a lead byte.
5092     * Do not go back into the bytes that will be read for finishing a partial
5093     * sequence from the previous buffer.
5094     * In the conversion loop compare source with sourceLimit only once
5095     * per multi-byte character.
5096     */
5097    {
5098        int32_t i, length;
5099
5100        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
5101        for(i=0; i<3 && i<length;) {
5102            b=*(sourceLimit-i-1);
5103            if(U8_IS_TRAIL(b)) {
5104                ++i;
5105            } else {
5106                if(i<utf8_countTrailBytes[b]) {
5107                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
5108                    sourceLimit-=i+1;
5109                }
5110                break;
5111            }
5112        }
5113    }
5114
5115    if(c!=0 && targetCapacity>0) {
5116        utf8->toUnicodeStatus=0;
5117        utf8->toULength=0;
5118        goto moreBytes;
5119        /* See note in ucnv_SBCSFromUTF8() about this goto. */
5120    }
5121
5122    /* conversion loop */
5123    while(source<sourceLimit) {
5124        if(targetCapacity>0) {
5125            b=*source++;
5126            if((int8_t)b>=0) {
5127                /* convert ASCII */
5128                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
5129                    *target++=b;
5130                    --targetCapacity;
5131                    continue;
5132                } else {
5133                    value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
5134                    if(value==0) {
5135                        c=b;
5136                        goto unassigned;
5137                    }
5138                }
5139            } else {
5140                if(b>0xe0) {
5141                    if( /* handle U+1000..U+D7FF inline */
5142                        (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
5143                                                        (b==0xed && (t1 <= 0x1f))) &&
5144                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
5145                    ) {
5146                        c=((b&0xf)<<6)|t1;
5147                        source+=2;
5148                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
5149                        if(value==0) {
5150                            c=(c<<6)|t2;
5151                            goto unassigned;
5152                        }
5153                    } else {
5154                        c=-1;
5155                    }
5156                } else if(b<0xe0) {
5157                    if( /* handle U+0080..U+07FF inline */
5158                        b>=0xc2 &&
5159                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
5160                    ) {
5161                        c=b&0x1f;
5162                        ++source;
5163                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
5164                        if(value==0) {
5165                            c=(c<<6)|t1;
5166                            goto unassigned;
5167                        }
5168                    } else {
5169                        c=-1;
5170                    }
5171                } else {
5172                    c=-1;
5173                }
5174
5175                if(c<0) {
5176                    /* handle "complicated" and error cases, and continuing partial characters */
5177                    oldToULength=0;
5178                    toULength=1;
5179                    toULimit=utf8_countTrailBytes[b]+1;
5180                    c=b;
5181moreBytes:
5182                    while(toULength<toULimit) {
5183                        if(source<sourceLimit) {
5184                            b=*source;
5185                            if(U8_IS_TRAIL(b)) {
5186                                ++source;
5187                                ++toULength;
5188                                c=(c<<6)+b;
5189                            } else {
5190                                break; /* sequence too short, stop with toULength<toULimit */
5191                            }
5192                        } else {
5193                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
5194                            source-=(toULength-oldToULength);
5195                            while(oldToULength<toULength) {
5196                                utf8->toUBytes[oldToULength++]=*source++;
5197                            }
5198                            utf8->toUnicodeStatus=c;
5199                            utf8->toULength=toULength;
5200                            utf8->mode=toULimit;
5201                            pToUArgs->source=(char *)source;
5202                            pFromUArgs->target=(char *)target;
5203                            return;
5204                        }
5205                    }
5206
5207                    if( toULength==toULimit &&      /* consumed all trail bytes */
5208                        (toULength==3 || toULength==2) &&             /* BMP */
5209                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
5210                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
5211                    ) {
5212                        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5213                    } else if(
5214                        toULength==toULimit && toULength==4 &&
5215                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
5216                    ) {
5217                        /* supplementary code point */
5218                        if(!hasSupplementary) {
5219                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5220                            stage2Entry=0;
5221                        } else {
5222                            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5223                        }
5224                    } else {
5225                        /* error handling: illegal UTF-8 byte sequence */
5226                        source-=(toULength-oldToULength);
5227                        while(oldToULength<toULength) {
5228                            utf8->toUBytes[oldToULength++]=*source++;
5229                        }
5230                        utf8->toULength=toULength;
5231                        pToUArgs->source=(char *)source;
5232                        pFromUArgs->target=(char *)target;
5233                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
5234                        return;
5235                    }
5236
5237                    /* get the bytes and the length for the output */
5238                    /* MBCS_OUTPUT_2 */
5239                    value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
5240
5241                    /* is this code point assigned, or do we use fallbacks? */
5242                    if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
5243                         (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
5244                    ) {
5245                        goto unassigned;
5246                    }
5247                }
5248            }
5249
5250            /* write the output character bytes from value and length */
5251            /* from the first if in the loop we know that targetCapacity>0 */
5252            if(value<=0xff) {
5253                /* this is easy because we know that there is enough space */
5254                *target++=(uint8_t)value;
5255                --targetCapacity;
5256            } else /* length==2 */ {
5257                *target++=(uint8_t)(value>>8);
5258                if(2<=targetCapacity) {
5259                    *target++=(uint8_t)value;
5260                    targetCapacity-=2;
5261                } else {
5262                    cnv->charErrorBuffer[0]=(char)value;
5263                    cnv->charErrorBufferLength=1;
5264
5265                    /* target overflow */
5266                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5267                    break;
5268                }
5269            }
5270            continue;
5271
5272unassigned:
5273            {
5274                /*
5275                 * Try an extension mapping.
5276                 * Pass in no source because we don't have UTF-16 input.
5277                 * If we have a partial match on c, we will return and revert
5278                 * to UTF-8->UTF-16->charset conversion.
5279                 */
5280                static const UChar nul=0;
5281                const UChar *noSource=&nul;
5282                c=_extFromU(cnv, cnv->sharedData,
5283                            c, &noSource, noSource,
5284                            &target, target+targetCapacity,
5285                            NULL, -1,
5286                            pFromUArgs->flush,
5287                            pErrorCode);
5288
5289                if(U_FAILURE(*pErrorCode)) {
5290                    /* not mappable or buffer overflow */
5291                    cnv->fromUChar32=c;
5292                    break;
5293                } else if(cnv->preFromUFirstCP>=0) {
5294                    /*
5295                     * Partial match, return and revert to pivoting.
5296                     * In normal from-UTF-16 conversion, we would just continue
5297                     * but then exit the loop because the extension match would
5298                     * have consumed the source.
5299                     */
5300                    break;
5301                } else {
5302                    /* a mapping was written to the target, continue */
5303
5304                    /* recalculate the targetCapacity after an extension mapping */
5305                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5306                    continue;
5307                }
5308            }
5309        } else {
5310            /* target is full */
5311            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5312            break;
5313        }
5314    }
5315
5316    /*
5317     * The sourceLimit may have been adjusted before the conversion loop
5318     * to stop before a truncated sequence.
5319     * If so, then collect the truncated sequence now.
5320     */
5321    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5322        c=utf8->toUBytes[0]=b=*source++;
5323        toULength=1;
5324        toULimit=utf8_countTrailBytes[b]+1;
5325        while(source<sourceLimit) {
5326            utf8->toUBytes[toULength++]=b=*source++;
5327            c=(c<<6)+b;
5328        }
5329        utf8->toUnicodeStatus=c;
5330        utf8->toULength=toULength;
5331        utf8->mode=toULimit;
5332    }
5333
5334    /* write back the updated pointers */
5335    pToUArgs->source=(char *)source;
5336    pFromUArgs->target=(char *)target;
5337}
5338
5339/* miscellaneous ------------------------------------------------------------ */
5340
5341static void
5342ucnv_MBCSGetStarters(const UConverter* cnv,
5343                 UBool starters[256],
5344                 UErrorCode *pErrorCode) {
5345    const int32_t *state0;
5346    int i;
5347
5348    state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
5349    for(i=0; i<256; ++i) {
5350        /* all bytes that cause a state transition from state 0 are lead bytes */
5351        starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
5352    }
5353}
5354
5355/*
5356 * This is an internal function that allows other converter implementations
5357 * to check whether a byte is a lead byte.
5358 */
5359U_CFUNC UBool
5360ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
5361    return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
5362}
5363
5364static void
5365ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
5366              int32_t offsetIndex,
5367              UErrorCode *pErrorCode) {
5368    UConverter *cnv=pArgs->converter;
5369    char *p, *subchar;
5370    char buffer[4];
5371    int32_t length;
5372
5373    /* first, select between subChar and subChar1 */
5374    if( cnv->subChar1!=0 &&
5375        (cnv->sharedData->mbcs.extIndexes!=NULL ?
5376            cnv->useSubChar1 :
5377            (cnv->invalidUCharBuffer[0]<=0xff))
5378    ) {
5379        /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
5380        subchar=(char *)&cnv->subChar1;
5381        length=1;
5382    } else {
5383        /* select subChar in all other cases */
5384        subchar=(char *)cnv->subChars;
5385        length=cnv->subCharLen;
5386    }
5387
5388    /* reset the selector for the next code point */
5389    cnv->useSubChar1=FALSE;
5390
5391    if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
5392        p=buffer;
5393
5394        /* fromUnicodeStatus contains prevLength */
5395        switch(length) {
5396        case 1:
5397            if(cnv->fromUnicodeStatus==2) {
5398                /* DBCS mode and SBCS sub char: change to SBCS */
5399                cnv->fromUnicodeStatus=1;
5400                *p++=UCNV_SI;
5401            }
5402            *p++=subchar[0];
5403            break;
5404        case 2:
5405            if(cnv->fromUnicodeStatus<=1) {
5406                /* SBCS mode and DBCS sub char: change to DBCS */
5407                cnv->fromUnicodeStatus=2;
5408                *p++=UCNV_SO;
5409            }
5410            *p++=subchar[0];
5411            *p++=subchar[1];
5412            break;
5413        default:
5414            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
5415            return;
5416        }
5417        subchar=buffer;
5418        length=(int32_t)(p-buffer);
5419    }
5420
5421    ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
5422}
5423
5424U_CFUNC UConverterType
5425ucnv_MBCSGetType(const UConverter* converter) {
5426    /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
5427    if(converter->sharedData->mbcs.countStates==1) {
5428        return (UConverterType)UCNV_SBCS;
5429    } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
5430        return (UConverterType)UCNV_EBCDIC_STATEFUL;
5431    } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
5432        return (UConverterType)UCNV_DBCS;
5433    }
5434    return (UConverterType)UCNV_MBCS;
5435}
5436
5437static const UConverterImpl _SBCSUTF8Impl={
5438    UCNV_MBCS,
5439
5440    ucnv_MBCSLoad,
5441    ucnv_MBCSUnload,
5442
5443    ucnv_MBCSOpen,
5444    NULL,
5445    NULL,
5446
5447    ucnv_MBCSToUnicodeWithOffsets,
5448    ucnv_MBCSToUnicodeWithOffsets,
5449    ucnv_MBCSFromUnicodeWithOffsets,
5450    ucnv_MBCSFromUnicodeWithOffsets,
5451    ucnv_MBCSGetNextUChar,
5452
5453    ucnv_MBCSGetStarters,
5454    ucnv_MBCSGetName,
5455    ucnv_MBCSWriteSub,
5456    NULL,
5457    ucnv_MBCSGetUnicodeSet,
5458
5459    NULL,
5460    ucnv_SBCSFromUTF8
5461};
5462
5463static const UConverterImpl _DBCSUTF8Impl={
5464    UCNV_MBCS,
5465
5466    ucnv_MBCSLoad,
5467    ucnv_MBCSUnload,
5468
5469    ucnv_MBCSOpen,
5470    NULL,
5471    NULL,
5472
5473    ucnv_MBCSToUnicodeWithOffsets,
5474    ucnv_MBCSToUnicodeWithOffsets,
5475    ucnv_MBCSFromUnicodeWithOffsets,
5476    ucnv_MBCSFromUnicodeWithOffsets,
5477    ucnv_MBCSGetNextUChar,
5478
5479    ucnv_MBCSGetStarters,
5480    ucnv_MBCSGetName,
5481    ucnv_MBCSWriteSub,
5482    NULL,
5483    ucnv_MBCSGetUnicodeSet,
5484
5485    NULL,
5486    ucnv_DBCSFromUTF8
5487};
5488
5489static const UConverterImpl _MBCSImpl={
5490    UCNV_MBCS,
5491
5492    ucnv_MBCSLoad,
5493    ucnv_MBCSUnload,
5494
5495    ucnv_MBCSOpen,
5496    NULL,
5497    NULL,
5498
5499    ucnv_MBCSToUnicodeWithOffsets,
5500    ucnv_MBCSToUnicodeWithOffsets,
5501    ucnv_MBCSFromUnicodeWithOffsets,
5502    ucnv_MBCSFromUnicodeWithOffsets,
5503    ucnv_MBCSGetNextUChar,
5504
5505    ucnv_MBCSGetStarters,
5506    ucnv_MBCSGetName,
5507    ucnv_MBCSWriteSub,
5508    NULL,
5509    ucnv_MBCSGetUnicodeSet
5510};
5511
5512
5513/* Static data is in tools/makeconv/ucnvstat.c for data-based
5514 * converters. Be sure to update it as well.
5515 */
5516
5517const UConverterSharedData _MBCSData={
5518    sizeof(UConverterSharedData), 1,
5519    NULL, NULL, NULL, FALSE, &_MBCSImpl,
5520    0
5521};
5522
5523#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
5524