ucnvmbcs.c revision b13da9df870a61b11249bf741347908dbea0edd8
1/*
2******************************************************************************
3*
4*   Copyright (C) 2000-2007, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  ucnvmbcs.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2000jul03
14*   created by: Markus W. Scherer
15*
16*   The current code in this file replaces the previous implementation
17*   of conversion code from multi-byte codepages to Unicode and back.
18*   This implementation supports the following:
19*   - legacy variable-length codepages with up to 4 bytes per character
20*   - all Unicode code points (up to 0x10ffff)
21*   - efficient distinction of unassigned vs. illegal byte sequences
22*   - it is possible in fromUnicode() to directly deal with simple
23*     stateful encodings (used for EBCDIC_STATEFUL)
24*   - it is possible to convert Unicode code points
25*     to a single zero byte (but not as a fallback except for SBCS)
26*
27*   Remaining limitations in fromUnicode:
28*   - byte sequences must not have leading zero bytes
29*   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
30*   - limitation to up to 4 bytes per character
31*
32*   ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
33*   limitations and adds m:n character mappings and other features.
34*   See ucnv_ext.h for details.
35*
36*   Change history:
37*
38*    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
39*                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
40*                             macros to ucnvmbcs.h file
41*/
42
43#include "unicode/utypes.h"
44
45#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
46
47#include "unicode/ucnv.h"
48#include "unicode/ucnv_cb.h"
49#include "unicode/udata.h"
50#include "unicode/uset.h"
51#include "ucnv_bld.h"
52#include "ucnvmbcs.h"
53#include "ucnv_ext.h"
54#include "ucnv_cnv.h"
55#include "umutex.h"
56#include "cmemory.h"
57#include "cstring.h"
58
59/* control optimizations according to the platform */
60#define MBCS_UNROLL_SINGLE_TO_BMP 1
61#define MBCS_UNROLL_SINGLE_FROM_BMP 0
62
63/*
64 * _MBCSHeader version 4.3
65 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
66 *
67 * Change from version 4.2:
68 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
69 *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
70 *   files which can be used instead of stages 1 & 2.
71 *   Faster lookups for roundtrips from most commonly used characters,
72 *   and lookups from UTF-8 byte sequences with a natural bit distribution.
73 *   See ucnvmbcs.h for more details.
74 *
75 * Change from version 4.1:
76 * - Added an optional extension table structure at the end of the .cnv file.
77 *   It is present if the upper bits of the header flags field contains a non-zero
78 *   byte offset to it.
79 *   Files that contain only a conversion table and no base table
80 *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
81 *   These contain the base table name between the MBCS header and the extension
82 *   data.
83 *
84 * Change from version 4.0:
85 * - Replace header.reserved with header.fromUBytesLength so that all
86 *   fields in the data have length.
87 *
88 * Changes from version 3 (for performance improvements):
89 * - new bit distribution for state table entries
90 * - reordered action codes
91 * - new data structure for single-byte fromUnicode
92 *   + stage 2 only contains indexes
93 *   + stage 3 stores 16 bits per character with classification bits 15..8
94 * - no multiplier for stage 1 entries
95 * - stage 2 for non-single-byte codepages contains the index and the flags in
96 *   one 32-bit value
97 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
98 *
99 * For more details about old versions of the MBCS data structure, see
100 * the corresponding versions of this file.
101 *
102 * Converting stateless codepage data ---------------------------------------***
103 * (or codepage data with simple states) to Unicode.
104 *
105 * Data structure and algorithm for converting from complex legacy codepages
106 * to Unicode. (Designed before 2000-may-22.)
107 *
108 * The basic idea is that the structure of legacy codepages can be described
109 * with state tables.
110 * When reading a byte stream, each input byte causes a state transition.
111 * Some transitions result in the output of a code point, some result in
112 * "unassigned" or "illegal" output.
113 * This is used here for character conversion.
114 *
115 * The data structure begins with a state table consisting of a row
116 * per state, with 256 entries (columns) per row for each possible input
117 * byte value.
118 * Each entry is 32 bits wide, with two formats distinguished by
119 * the sign bit (bit 31):
120 *
121 * One format for transitional entries (bit 31 not set) for non-final bytes, and
122 * one format for final entries (bit 31 set).
123 * Both formats contain the number of the next state in the same bit
124 * positions.
125 * State 0 is the initial state.
126 *
127 * Most of the time, the offset values of subsequent states are added
128 * up to a scalar value. This value will eventually be the index of
129 * the Unicode code point in a table that follows the state table.
130 * The effect is that the code points for final state table rows
131 * are contiguous. The code points of final state rows follow each other
132 * in the order of the references to those final states by previous
133 * states, etc.
134 *
135 * For some terminal states, the offset is itself the output Unicode
 * code point (16 bits for a BMP code point or 20 bits for a supplementary
 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
138 * For others, the code point in the Unicode table is stored with either
139 * one or two code units: one for BMP code points, two for a pair of
140 * surrogates.
141 * All code points for a final state entry take up the same number of code
142 * units, regardless of whether they all actually _use_ the same number
143 * of code units. This is necessary for simple array access.
144 *
145 * An additional feature comes in with what in ICU is called "fallback"
146 * mappings:
147 *
148 * In addition to round-trippable, precise, 1:1 mappings, there are often
149 * mappings defined between similar, though not the same, characters.
150 * Typically, such mappings occur only in fromUnicode mapping tables because
151 * Unicode has a superset repertoire of most other codepages. However, it
152 * is possible to provide such mappings in the toUnicode tables, too.
153 * In this case, the fallback mappings are partly integrated into the
154 * general state tables because the structure of the encoding includes their
155 * byte sequences.
156 * For final entries in an initial state, fallback mappings are stored in
157 * the entry itself like with roundtrip mappings.
158 * For other final entries, they are stored in the code units table if
159 * the entry is for a pair of code units.
160 * For single-unit results in the code units table, there is no space to
161 * alternatively hold a fallback mapping; in this case, the code unit
162 * is stored as U+fffe (unassigned), and the fallback mapping needs to
163 * be looked up by the scalar offset value in a separate table.
164 *
165 * "Unassigned" state entries really mean "structurally unassigned",
166 * i.e., such a byte sequence will never have a mapping result.
167 *
168 * The interpretation of the bits in each entry is as follows:
169 *
170 * Bit 31 not set, not a terminal entry ("transitional"):
171 * 30..24 next state
172 * 23..0  offset delta, to be added up
173 *
174 * Bit 31 set, terminal ("final") entry:
175 * 30..24 next state (regardless of action code)
176 * 23..20 action code:
177 *        action codes 0 and 1 result in precise-mapping Unicode code points
178 *        0  valid byte sequence
179 *           19..16 not used, 0
180 *           15..0  16-bit Unicode BMP code point
181 *                  never U+fffe or U+ffff
182 *        1  valid byte sequence
183 *           19..0  20-bit Unicode supplementary code point
184 *                  never U+fffe or U+ffff
185 *
186 *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
187 *        2  valid byte sequence (fallback)
188 *           19..16 not used, 0
189 *           15..0  16-bit Unicode BMP code point as fallback result
190 *        3  valid byte sequence (fallback)
191 *           19..0  20-bit Unicode supplementary code point as fallback result
192 *
193 *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
194 *        depending on the code units they result in
195 *        4  valid byte sequence
196 *           19..9  not used, 0
197 *            8..0  final offset delta
198 *                  pointing to one 16-bit code unit which may be
199 *                  fffe  unassigned -- look for a fallback for this offset
200 *                  ffff  illegal
201 *        5  valid byte sequence
202 *           19..9  not used, 0
203 *            8..0  final offset delta
204 *                  pointing to two 16-bit code units
205 *                  (typically UTF-16 surrogates)
206 *                  the result depends on the first code unit as follows:
207 *                  0000..d7ff  roundtrip BMP code point (1st alone)
208 *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
209 *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
210 *                  e000        roundtrip BMP code point (2nd alone)
211 *                  e001        fallback BMP code point (2nd alone)
212 *                  fffe        unassigned
213 *                  ffff        illegal
214 *           (the final offset deltas are at most 255 * 2,
215 *            times 2 because of storing code unit pairs)
216 *
217 *        6  unassigned byte sequence
218 *           19..16 not used, 0
219 *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
220 *                  this does not contain a final offset delta because the main
221 *                  purpose of this action code is to save scalar offset values;
222 *                  therefore, fallback values cannot be assigned to byte
223 *                  sequences that result in this action code
224 *        7  illegal byte sequence
225 *           19..16 not used, 0
226 *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
227 *        8  state change only
228 *           19..0  not used, 0
229 *           useful for state changes in simple stateful encodings,
230 *           at Shift-In/Shift-Out codes
231 *
232 *
233 *        9..15 reserved for future use
234 *           current implementations will only perform a state change
235 *           and ignore bits 19..0
236 *
237 * An encoding with contiguous ranges of unassigned byte sequences, like
238 * Shift-JIS and especially EUC-TW, can be stored efficiently by having
239 * at least two states for the trail bytes:
240 * One trail byte state that results in code points, and one that only
241 * has "unassigned" and "illegal" terminal states.
242 *
243 * Note: partly by accident, this data structure supports simple stateful
244 * encodings without any additional logic.
245 * Currently, only simple Shift-In/Shift-Out schemes are handled with
246 * appropriate state tables (especially EBCDIC_STATEFUL!).
247 *
248 * MBCS version 2 added:
249 * unassigned and illegal action codes have U+fffe and U+ffff
250 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
251 *
252 * Converting from Unicode to codepage bytes --------------------------------***
253 *
254 * The conversion data structure for fromUnicode is designed for the known
255 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
256 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
257 * a roundtrip mapping.
258 *
259 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
260 * like in the character properties table.
261 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
262 * with the resulting bytes is at offsetFromUBytes.
263 *
264 * Beginning with version 4, single-byte codepages have a significantly different
265 * trie compared to other codepages.
266 * In all cases, the entry in stage 1 is directly the index of the block of
267 * 64 entries in stage 2.
268 *
269 * Single-byte lookup:
270 *
271 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
272 * Stage 3 contains one 16-bit word per result:
273 * Bits 15..8 indicate the kind of result:
274 *    f  roundtrip result
275 *    c  fallback result from private-use code point
276 *    8  fallback result from other code points
277 *    0  unassigned
278 * Bits 7..0 contain the codepage byte. A zero byte is always possible.
279 *
280 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
281 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
282 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
283 * ASCII code points can be looked up with a linear array access into stage 3.
284 * See maxFastUChar and other details in ucnvmbcs.h.
285 *
286 * Multi-byte lookup:
287 *
288 * Stage 2 contains a 32-bit word for each 16-block in stage 3:
289 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
290 *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
291 *             If this test is false, then a non-zero result will be interpreted as
292 *             a fallback mapping.
293 * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
294 *
295 * Stage 3 contains 2, 3, or 4 bytes per result.
296 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
297 * while 3 bytes are stored as bytes in big-endian order.
298 * Leading zero bytes are ignored, and the number of bytes is counted.
299 * A zero byte mapping result is possible as a roundtrip result.
300 * For some output types, the actual result is processed from this;
301 * see ucnv_MBCSFromUnicodeWithOffsets().
302 *
303 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
304 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
305 *
306 * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
307 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
308 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
309 * ASCII code points can be looked up with a linear array access into stage 3.
310 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
311 *
312 * In version 3, stage 2 blocks may overlap by multiples of the multiplier
313 * for compaction.
314 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
315 * may overlap by any number of entries.
316 *
317 * MBCS version 2 added:
318 * the converter checks for known output types, which allows
319 * adding new ones without crashing an unaware converter
320 */
321
322static const UConverterImpl _SBCSUTF8Impl;
323static const UConverterImpl _DBCSUTF8Impl;
324
325/* GB 18030 data ------------------------------------------------------------ */
326
/*
 * Helper macros for linear values of GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) folds the four bytes of a sequence into one
 * scalar, using radix 10 for b and d and radix 126 for c.
 * The byte values are used as they are (no lead/trail byte base is
 * subtracted), so only the difference between two linear values is
 * meaningful.
 *
 * LINEAR_18030_BASE is the linear value of the sequence 81 30 81 30.
 *
 * LINEAR(x) takes the four bytes packed into one 32-bit value,
 * most significant byte first.
 *
 * Every macro parameter is parenthesized in the expansion so that an
 * expression (for example a|b or a+1) can be passed safely.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)

/*
 * Some ranges of GB 18030 where both the Unicode code points and the
 * GB four-byte sequences are contiguous and are handled algorithmically by
 * the special callback functions below.
 *
 * Each row holds, in this order:
 *   [0] first Unicode code point of the range
 *   [1] last Unicode code point of the range
 *   [2] linear value of the first GB 18030 four-byte sequence
 *   [3] linear value of the last GB 18030 four-byte sequence
 *
 * Note that single surrogates are not mapped by GB 18030
 * as of the re-released mapping tables from 2000-nov-30.
 */
static const uint32_t
gb18030Ranges[13][4]={
    {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
    {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
    {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
    {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
    {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
    {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
    {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
    {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
    {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
    {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
    {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
    {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
    {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
};
359
360/* bit flag for UConverter.options indicating GB 18030 special handling */
361#define _MBCS_OPTION_GB18030 0x8000
362
363/* Miscellaneous ------------------------------------------------------------ */
364
365/* similar to ucnv_MBCSGetNextUChar() but recursive */
366static void
367_getUnicodeSetForBytes(const UConverterSharedData *sharedData,
368                       const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits,
369                       const USetAdder *sa,
370                       UConverterUnicodeSet which,
371                       uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte,
372
373                       UErrorCode *pErrorCode) {
374    int32_t b, entry;
375
376    for(b=lowByte; b<=highByte; ++b) {
377        entry=stateTable[state][b];
378        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
379            _getUnicodeSetForBytes(
380                sharedData, stateTable, unicodeCodeUnits,
381                sa, which,
382                (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry),
383                offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
384                0, 0xff,
385                pErrorCode);
386        } else {
387            UChar32 c;
388            int32_t rowOffset=offset;
389            uint8_t action;
390
391            c=U_SENTINEL;
392
393            /*
394             * An if-else-if chain provides more reliable performance for
395             * the most common cases compared to a switch.
396             */
397            action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
398            if(action==MBCS_STATE_VALID_DIRECT_16) {
399                /* output BMP code point */
400                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
401            } else if(action==MBCS_STATE_VALID_16) {
402                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
403                c=unicodeCodeUnits[offset];
404                if(c<0xfffe) {
405                    /* output BMP code point */
406                } else {
407                    c=U_SENTINEL;
408                }
409            } else if(action==MBCS_STATE_VALID_16_PAIR) {
410                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
411                c=unicodeCodeUnits[offset++];
412                if(c<0xd800) {
413                    /* output BMP code point below 0xd800 */
414                } else if(c<=0xdbff) {
415                    /* output roundtrip or fallback supplementary code point */
416                    c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
417                } else if(c==0xe000) {
418                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
419                    c=unicodeCodeUnits[offset];
420                } else {
421                    c=U_SENTINEL;
422                }
423            } else if(action==MBCS_STATE_VALID_DIRECT_20) {
424                /* output supplementary code point */
425                c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
426            }
427
428            if(c>=0) {
429                sa->add(sa->set, c);
430            }
431            offset=rowOffset;
432        }
433    }
434}
435
436/*
437 * Internal function returning a UnicodeSet for toUnicode() conversion.
438 * Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
439 * In the future, if we add support for reverse-fallback sets, this function
440 * needs to be updated, and called for each initial state.
441 * Does not currently handle extensions.
442 * Does not empty the set first.
443 */
444U_CFUNC void
445ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
446                           const USetAdder *sa,
447                           UConverterUnicodeSet which,
448                           uint8_t state, int32_t lowByte, int32_t highByte,
449                           UErrorCode *pErrorCode) {
450    _getUnicodeSetForBytes(
451        sharedData, sharedData->mbcs.stateTable, sharedData->mbcs.unicodeCodeUnits,
452        sa, which,
453        state, 0, lowByte, highByte,
454        pErrorCode);
455}
456
457U_CFUNC void
458ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
459                             const USetAdder *sa,
460                             UConverterUnicodeSet which,
461                             UErrorCode *pErrorCode) {
462    const UConverterMBCSTable *mbcsTable;
463    const uint16_t *table;
464
465    uint32_t st3;
466    uint16_t st1, maxStage1, st2;
467
468    UChar32 c;
469
470    /* enumerate the from-Unicode trie table */
471    mbcsTable=&sharedData->mbcs;
472    table=mbcsTable->fromUnicodeTable;
473    if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
474        maxStage1=0x440;
475    } else {
476        maxStage1=0x40;
477    }
478
479    c=0; /* keep track of the current code point while enumerating */
480
481    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
482        const uint16_t *stage2, *stage3, *results;
483
484        results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
485
486        for(st1=0; st1<maxStage1; ++st1) {
487            st2=table[st1];
488            if(st2>maxStage1) {
489                stage2=table+st2;
490                for(st2=0; st2<64; ++st2) {
491                    if((st3=stage2[st2])!=0) {
492                        /* read the stage 3 block */
493                        stage3=results+st3;
494
495                        /*
496                         * Add code points for which the roundtrip flag is set.
497                         * Once we get a set for fallback mappings, we have to use
498                         * a threshold variable with a value of 0x800.
499                         * See ucnv_MBCSSingleFromBMPWithOffsets() and
500                         * MBCS_SINGLE_RESULT_FROM_U() for details.
501                         */
502                        do {
503                            if(*stage3++>=0xf00) {
504                                sa->add(sa->set, c);
505                            }
506                        } while((++c&0xf)!=0);
507                    } else {
508                        c+=16; /* empty stage 3 block */
509                    }
510                }
511            } else {
512                c+=1024; /* empty stage 2 block */
513            }
514        }
515    } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) {
516        /* ignore single-byte results */
517        const uint32_t *stage2;
518        const uint16_t *stage3, *results;
519
520        results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
521
522        for(st1=0; st1<maxStage1; ++st1) {
523            st2=table[st1];
524            if(st2>(maxStage1>>1)) {
525                stage2=(const uint32_t *)table+st2;
526                for(st2=0; st2<64; ++st2) {
527                    if((st3=stage2[st2])!=0) {
528                        /* read the stage 3 block */
529                        stage3=results+16*(uint32_t)(uint16_t)st3;
530
531                        /* get the roundtrip flags for the stage 3 block */
532                        st3>>=16;
533
534                        /*
535                         * Add code points for which the roundtrip flag is set.
536                         * Once we get a set for fallback mappings, we have to check
537                         * non-roundtrip stage 3 results for whether they are 0.
538                         * See ucnv_MBCSFromUnicodeWithOffsets() for details.
539                         *
540                         * Ignore single-byte results (<0x100).
541                         */
542                        do {
543                            if((st3&1)!=0 && *stage3>=0x100) {
544                                sa->add(sa->set, c);
545                            }
546                            st3>>=1;
547                            ++stage3;
548                        } while((++c&0xf)!=0);
549                    } else {
550                        c+=16; /* empty stage 3 block */
551                    }
552                }
553            } else {
554                c+=1024; /* empty stage 2 block */
555            }
556        }
557    } else {
558        const uint32_t *stage2;
559
560        for(st1=0; st1<maxStage1; ++st1) {
561            st2=table[st1];
562            if(st2>(maxStage1>>1)) {
563                stage2=(const uint32_t *)table+st2;
564                for(st2=0; st2<64; ++st2) {
565                    if((st3=stage2[st2])!=0) {
566                        /* get the roundtrip flags for the stage 3 block */
567                        st3>>=16;
568
569                        /*
570                         * Add code points for which the roundtrip flag is set.
571                         * Once we get a set for fallback mappings, we have to check
572                         * non-roundtrip stage 3 results for whether they are 0.
573                         * See ucnv_MBCSFromUnicodeWithOffsets() for details.
574                         */
575                        do {
576                            if(st3&1) {
577                                sa->add(sa->set, c);
578                            }
579                            st3>>=1;
580                        } while((++c&0xf)!=0);
581                    } else {
582                        c+=16; /* empty stage 3 block */
583                    }
584                }
585            } else {
586                c+=1024; /* empty stage 2 block */
587            }
588        }
589    }
590
591    ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
592}
593
594static void
595ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
596                   const USetAdder *sa,
597                   UConverterUnicodeSet which,
598                   UErrorCode *pErrorCode) {
599    if(cnv->options&_MBCS_OPTION_GB18030) {
600        sa->addRange(sa->set, 0, 0xd7ff);
601        sa->addRange(sa->set, 0xe000, 0x10ffff);
602    } else {
603        ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
604    }
605}
606
607/* conversion extensions for input not in the main table -------------------- */
608
609/*
610 * Hardcoded extension handling for GB 18030.
 * For the definitions of the LINEAR macros and gb18030Ranges, see near the beginning of this file.
612 *
613 * In the future, conversion extensions may handle m:n mappings and delta tables,
614 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
615 *
616 * If an input character cannot be mapped, then these functions set an error
617 * code. The framework will then call the callback function.
618 */
619
620/*
621 * @return if(U_FAILURE) return the code point for cnv->fromUChar32
622 *         else return 0 after output has been written to the target
623 */
624static UChar32
625_extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
626          UChar32 cp,
627          const UChar **source, const UChar *sourceLimit,
628          uint8_t **target, const uint8_t *targetLimit,
629          int32_t **offsets, int32_t sourceIndex,
630          UBool flush,
631          UErrorCode *pErrorCode) {
632    const int32_t *cx;
633
634    cnv->useSubChar1=FALSE;
635
636    if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
637        ucnv_extInitialMatchFromU(
638            cnv, cx,
639            cp, source, sourceLimit,
640            (char **)target, (char *)targetLimit,
641            offsets, sourceIndex,
642            flush,
643            pErrorCode)
644    ) {
645        return 0; /* an extension mapping handled the input */
646    }
647
648    /* GB 18030 */
649    if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
650        const uint32_t *range;
651        int32_t i;
652
653        range=gb18030Ranges[0];
654        for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
655            if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
656                /* found the Unicode code point, output the four-byte sequence for it */
657                uint32_t linear;
658                char bytes[4];
659
660                /* get the linear value of the first GB 18030 code in this range */
661                linear=range[2]-LINEAR_18030_BASE;
662
663                /* add the offset from the beginning of the range */
664                linear+=((uint32_t)cp-range[0]);
665
666                /* turn this into a four-byte sequence */
667                bytes[3]=(char)(0x30+linear%10); linear/=10;
668                bytes[2]=(char)(0x81+linear%126); linear/=126;
669                bytes[1]=(char)(0x30+linear%10); linear/=10;
670                bytes[0]=(char)(0x81+linear);
671
672                /* output this sequence */
673                ucnv_fromUWriteBytes(cnv,
674                                     bytes, 4, (char **)target, (char *)targetLimit,
675                                     offsets, sourceIndex, pErrorCode);
676                return 0;
677            }
678        }
679    }
680
681    /* no mapping */
682    *pErrorCode=U_INVALID_CHAR_FOUND;
683    return cp;
684}
685
686/*
687 * Input sequence: cnv->toUBytes[0..length[
688 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
689 *         else return 0 after output has been written to the target
690 */
691static int8_t
692_extToU(UConverter *cnv, const UConverterSharedData *sharedData,
693        int8_t length,
694        const uint8_t **source, const uint8_t *sourceLimit,
695        UChar **target, const UChar *targetLimit,
696        int32_t **offsets, int32_t sourceIndex,
697        UBool flush,
698        UErrorCode *pErrorCode) {
699    const int32_t *cx;
700
701    if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
702        ucnv_extInitialMatchToU(
703            cnv, cx,
704            length, (const char **)source, (const char *)sourceLimit,
705            target, targetLimit,
706            offsets, sourceIndex,
707            flush,
708            pErrorCode)
709    ) {
710        return 0; /* an extension mapping handled the input */
711    }
712
713    /* GB 18030 */
714    if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
715        const uint32_t *range;
716        uint32_t linear;
717        int32_t i;
718
719        linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
720        range=gb18030Ranges[0];
721        for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
722            if(range[2]<=linear && linear<=range[3]) {
723                /* found the sequence, output the Unicode code point for it */
724                *pErrorCode=U_ZERO_ERROR;
725
726                /* add the linear difference between the input and start sequences to the start code point */
727                linear=range[0]+(linear-range[2]);
728
729                /* output this code point */
730                ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
731
732                return 0;
733            }
734        }
735    }
736
737    /* no mapping */
738    *pErrorCode=U_INVALID_CHAR_FOUND;
739    return length;
740}
741
742/* EBCDIC swap LF<->NL ------------------------------------------------------ */
743
744/*
745 * This code modifies a standard EBCDIC<->Unicode mapping table for
746 * OS/390 (z/OS) Unix System Services (Open Edition).
747 * The difference is in the mapping of Line Feed and New Line control codes:
748 * Standard EBCDIC maps
749 *
750 *   <U000A> \x25 |0
751 *   <U0085> \x15 |0
752 *
753 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
754 * mapping
755 *
756 *   <U000A> \x15 |0
757 *   <U0085> \x25 |0
758 *
759 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
760 * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support the same EBCDIC charset in both versions
 * without duplicating the entire installed table.
763 */
764
/* byte values of Line Feed and New Line in standard EBCDIC */
#define EBCDIC_LF 0x25
#define EBCDIC_NL 0x15

/*
 * the same byte values combined with the roundtrip flag (0xf00),
 * as stored in Unicode-to-single-byte tables
 */
#define EBCDIC_RT_LF (0xf00|EBCDIC_LF)
#define EBCDIC_RT_NL (0xf00|EBCDIC_NL)

/* Unicode code points for Line Feed and New Line (Next Line) */
#define U_LF 0x0a
#define U_NL 0x85
776
777static UBool
778_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
779    UConverterMBCSTable *mbcsTable;
780
781    const uint16_t *table, *results;
782    const uint8_t *bytes;
783
784    int32_t (*newStateTable)[256];
785    uint16_t *newResults;
786    uint8_t *p;
787    char *name;
788
789    uint32_t stage2Entry;
790    uint32_t size, sizeofFromUBytes;
791
792    mbcsTable=&sharedData->mbcs;
793
794    table=mbcsTable->fromUnicodeTable;
795    bytes=mbcsTable->fromUnicodeBytes;
796    results=(const uint16_t *)bytes;
797
798    /*
799     * Check that this is an EBCDIC table with SBCS portion -
800     * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
801     *
802     * If not, ignore the option. Options are always ignored if they do not apply.
803     */
804    if(!(
805         (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
806         mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
807         mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
808    )) {
809        return FALSE;
810    }
811
812    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
813        if(!(
814             EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
815             EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
816        )) {
817            return FALSE;
818        }
819    } else /* MBCS_OUTPUT_2_SISO */ {
820        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
821        if(!(
822             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
823             EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
824        )) {
825            return FALSE;
826        }
827
828        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
829        if(!(
830             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
831             EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
832        )) {
833            return FALSE;
834        }
835    }
836
837    if(mbcsTable->fromUBytesLength>0) {
838        /*
839         * We _know_ the number of bytes in the fromUnicodeBytes array
840         * starting with header.version 4.1.
841         */
842        sizeofFromUBytes=mbcsTable->fromUBytesLength;
843    } else {
844        /*
845         * Otherwise:
846         * There used to be code to enumerate the fromUnicode
847         * trie and find the highest entry, but it was removed in ICU 3.2
848         * because it was not tested and caused a low code coverage number.
849         * See Jitterbug 3674.
850         * This affects only some .cnv file formats with a header.version
851         * below 4.1, and only when swaplfnl is requested.
852         *
853         * ucnvmbcs.c revision 1.99 is the last one with the
854         * ucnv_MBCSSizeofFromUBytes() function.
855         */
856        *pErrorCode=U_INVALID_FORMAT_ERROR;
857        return FALSE;
858    }
859
860    /*
861     * The table has an appropriate format.
862     * Allocate and build
863     * - a modified to-Unicode state table
864     * - a modified from-Unicode output array
865     * - a converter name string with the swap option appended
866     */
867    size=
868        mbcsTable->countStates*1024+
869        sizeofFromUBytes+
870        UCNV_MAX_CONVERTER_NAME_LENGTH+20;
871    p=(uint8_t *)uprv_malloc(size);
872    if(p==NULL) {
873        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
874        return FALSE;
875    }
876
877    /* copy and modify the to-Unicode state table */
878    newStateTable=(int32_t (*)[256])p;
879    uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
880
881    newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
882    newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
883
884    /* copy and modify the from-Unicode result table */
885    newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
886    uprv_memcpy(newResults, bytes, sizeofFromUBytes);
887
888    /* conveniently, the table access macros work on the left side of expressions */
889    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
890        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
891        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
892    } else /* MBCS_OUTPUT_2_SISO */ {
893        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
894        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
895
896        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
897        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
898    }
899
900    /* set the canonical converter name */
901    name=(char *)newResults+sizeofFromUBytes;
902    uprv_strcpy(name, sharedData->staticData->name);
903    uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
904
905    /* set the pointers */
906    umtx_lock(NULL);
907    if(mbcsTable->swapLFNLStateTable==NULL) {
908        mbcsTable->swapLFNLStateTable=newStateTable;
909        mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
910        mbcsTable->swapLFNLName=name;
911
912        newStateTable=NULL;
913    }
914    umtx_unlock(NULL);
915
916    /* release the allocated memory if another thread beat us to it */
917    if(newStateTable!=NULL) {
918        uprv_free(newStateTable);
919    }
920    return TRUE;
921}
922
923/* MBCS setup functions ----------------------------------------------------- */
924
925static void
926ucnv_MBCSLoad(UConverterSharedData *sharedData,
927          UConverterLoadArgs *pArgs,
928          const uint8_t *raw,
929          UErrorCode *pErrorCode) {
930    UDataInfo info;
931    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
932    _MBCSHeader *header=(_MBCSHeader *)raw;
933    uint32_t offset;
934
935    if(header->version[0]!=4) {
936        *pErrorCode=U_INVALID_TABLE_FORMAT;
937        return;
938    }
939
940    mbcsTable->outputType=(uint8_t)header->flags;
941
942    /* extension data, header version 4.2 and higher */
943    offset=header->flags>>8;
944    if(offset!=0) {
945        mbcsTable->extIndexes=(const int32_t *)(raw+offset);
946    }
947
948    if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
949        UConverterLoadArgs args={ 0 };
950        UConverterSharedData *baseSharedData;
951        const int32_t *extIndexes;
952        const char *baseName;
953
954        /* extension-only file, load the base table and set values appropriately */
955        if((extIndexes=mbcsTable->extIndexes)==NULL) {
956            /* extension-only file without extension */
957            *pErrorCode=U_INVALID_TABLE_FORMAT;
958            return;
959        }
960
961        if(pArgs->nestedLoads!=1) {
962            /* an extension table must not be loaded as a base table */
963            *pErrorCode=U_INVALID_TABLE_FILE;
964            return;
965        }
966
967        /* load the base table */
968        baseName=(const char *)(header+1);
969        if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
970            /* forbid loading this same extension-only file */
971            *pErrorCode=U_INVALID_TABLE_FORMAT;
972            return;
973        }
974
975        /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
976        args.size=sizeof(UConverterLoadArgs);
977        args.nestedLoads=2;
978        args.reserved=pArgs->reserved;
979        args.options=pArgs->options;
980        args.pkg=pArgs->pkg;
981        args.name=baseName;
982        baseSharedData=ucnv_load(&args, pErrorCode);
983        if(U_FAILURE(*pErrorCode)) {
984            return;
985        }
986        if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
987            baseSharedData->mbcs.baseSharedData!=NULL
988        ) {
989            ucnv_unload(baseSharedData);
990            *pErrorCode=U_INVALID_TABLE_FORMAT;
991            return;
992        }
993
994        /* copy the base table data */
995        uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
996
997        /* overwrite values with relevant ones for the extension converter */
998        mbcsTable->baseSharedData=baseSharedData;
999        mbcsTable->extIndexes=extIndexes;
1000
1001        /*
1002         * It would be possible to share the swapLFNL data with a base converter,
1003         * but the generated name would have to be different, and the memory
1004         * would have to be free'd only once.
1005         * It is easier to just create the data for the extension converter
1006         * separately when it is requested.
1007         */
1008        mbcsTable->swapLFNLStateTable=NULL;
1009        mbcsTable->swapLFNLFromUnicodeBytes=NULL;
1010        mbcsTable->swapLFNLName=NULL;
1011
1012        /*
1013         * Set a special, runtime-only outputType if the extension converter
1014         * is a DBCS version of a base converter that also maps single bytes.
1015         */
1016        if( sharedData->staticData->conversionType==UCNV_DBCS ||
1017                (sharedData->staticData->conversionType==UCNV_MBCS &&
1018                 sharedData->staticData->minBytesPerChar>=2)
1019        ) {
1020            if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
1021                /* the base converter is SI/SO-stateful */
1022                int32_t entry;
1023
1024                /* get the dbcs state from the state table entry for SO=0x0e */
1025                entry=mbcsTable->stateTable[0][0xe];
1026                if( MBCS_ENTRY_IS_FINAL(entry) &&
1027                    MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
1028                    MBCS_ENTRY_FINAL_STATE(entry)!=0
1029                ) {
1030                    mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1031
1032                    mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1033                }
1034            } else if(
1035                baseSharedData->staticData->conversionType==UCNV_MBCS &&
1036                baseSharedData->staticData->minBytesPerChar==1 &&
1037                baseSharedData->staticData->maxBytesPerChar==2 &&
1038                mbcsTable->countStates<=127
1039            ) {
1040                /* non-stateful base converter, need to modify the state table */
1041                int32_t (*newStateTable)[256];
1042                int32_t *state;
1043                int32_t i, count;
1044
1045                /* allocate a new state table and copy the base state table contents */
1046                count=mbcsTable->countStates;
1047                newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
1048                if(newStateTable==NULL) {
1049                    ucnv_unload(baseSharedData);
1050                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1051                    return;
1052                }
1053
1054                uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
1055
1056                /* change all final single-byte entries to go to a new all-illegal state */
1057                state=newStateTable[0];
1058                for(i=0; i<256; ++i) {
1059                    if(MBCS_ENTRY_IS_FINAL(state[i])) {
1060                        state[i]=MBCS_ENTRY_TRANSITION(count, 0);
1061                    }
1062                }
1063
1064                /* build the new all-illegal state */
1065                state=newStateTable[count];
1066                for(i=0; i<256; ++i) {
1067                    state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
1068                }
1069                mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
1070                mbcsTable->countStates=(uint8_t)(count+1);
1071                mbcsTable->stateTableOwned=TRUE;
1072
1073                mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1074            }
1075        }
1076
1077        /*
1078         * unlike below for files with base tables, do not get the unicodeMask
1079         * from the sharedData; instead, use the base table's unicodeMask,
1080         * which we copied in the memcpy above;
1081         * this is necessary because the static data unicodeMask, especially
1082         * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1083         */
1084    } else {
1085        /* conversion file with a base table; an additional extension table is optional */
1086        /* make sure that the output type is known */
1087        switch(mbcsTable->outputType) {
1088        case MBCS_OUTPUT_1:
1089        case MBCS_OUTPUT_2:
1090        case MBCS_OUTPUT_3:
1091        case MBCS_OUTPUT_4:
1092        case MBCS_OUTPUT_3_EUC:
1093        case MBCS_OUTPUT_4_EUC:
1094        case MBCS_OUTPUT_2_SISO:
1095            /* OK */
1096            break;
1097        default:
1098            *pErrorCode=U_INVALID_TABLE_FORMAT;
1099            return;
1100        }
1101
1102        mbcsTable->countStates=(uint8_t)header->countStates;
1103        mbcsTable->countToUFallbacks=header->countToUFallbacks;
1104        mbcsTable->stateTable=(const int32_t (*)[256])(raw+sizeof(_MBCSHeader));
1105        mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
1106        mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
1107
1108        mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
1109        mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
1110        mbcsTable->fromUBytesLength=header->fromUBytesLength;
1111
1112        /*
1113         * converter versions 6.1 and up contain a unicodeMask that is
1114         * used here to select the most efficient function implementations
1115         */
1116        info.size=sizeof(UDataInfo);
1117        udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
1118        if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
1119            /* mask off possible future extensions to be safe */
1120            mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
1121        } else {
1122            /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1123            mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
1124        }
1125
1126        /*
1127         * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1128         * Check for the header version, SBCS vs. MBCS, and for whether the
1129         * data structures are optimized for code points as high as what the
1130         * runtime code is designed for.
1131         * The implementation does not handle mapping tables with entries for
1132         * unpaired surrogates.
1133         */
1134        if( header->version[1]>=3 &&
1135            (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
1136            (mbcsTable->countStates==1 ?
1137                (header->version[2]>=(SBCS_FAST_MAX>>8)) :
1138                (header->version[2]>=(MBCS_FAST_MAX>>8))
1139            )
1140        ) {
1141            mbcsTable->utf8Friendly=TRUE;
1142
1143            if(mbcsTable->countStates==1) {
1144                /*
1145                 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1146                 * Build a table with indexes to each block, to be used instead of
1147                 * the regular stage 1/2 table.
1148                 */
1149                int32_t i;
1150                for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
1151                    mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
1152                }
1153                /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
1154                mbcsTable->maxFastUChar=SBCS_FAST_MAX;
1155            } else {
1156                /*
1157                 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1158                 * The .cnv file is prebuilt with an additional stage table with indexes
1159                 * to each block.
1160                 */
1161                mbcsTable->mbcsIndex=(const uint16_t *)(mbcsTable->fromUnicodeBytes+mbcsTable->fromUBytesLength);
1162                mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
1163            }
1164        }
1165
1166        /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
1167        {
1168            uint32_t asciiRoundtrips=0xffffffff;
1169            int32_t i;
1170
1171            for(i=0; i<0x80; ++i) {
1172                if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
1173                    asciiRoundtrips&=~((uint32_t)1<<(i>>2));
1174                }
1175            }
1176            mbcsTable->asciiRoundtrips=asciiRoundtrips;
1177        }
1178    }
1179
1180    /* Set the impl pointer here so that it is set for both extension-only and base tables. */
1181    if(mbcsTable->utf8Friendly) {
1182        if(mbcsTable->countStates==1) {
1183            sharedData->impl=&_SBCSUTF8Impl;
1184        } else {
1185            if(mbcsTable->outputType==MBCS_OUTPUT_2) {
1186                sharedData->impl=&_DBCSUTF8Impl;
1187            }
1188        }
1189    }
1190
1191    if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
1192        /*
1193         * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1194         * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1195         */
1196        mbcsTable->asciiRoundtrips=0;
1197    }
1198}
1199
1200static void
1201ucnv_MBCSUnload(UConverterSharedData *sharedData) {
1202    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1203
1204    if(mbcsTable->swapLFNLStateTable!=NULL) {
1205        uprv_free(mbcsTable->swapLFNLStateTable);
1206    }
1207    if(mbcsTable->stateTableOwned) {
1208        uprv_free((void *)mbcsTable->stateTable);
1209    }
1210    if(mbcsTable->baseSharedData!=NULL) {
1211        ucnv_unload(mbcsTable->baseSharedData);
1212    }
1213}
1214
1215static void
1216ucnv_MBCSOpen(UConverter *cnv,
1217          const char *name,
1218          const char *locale,
1219          uint32_t options,
1220          UErrorCode *pErrorCode) {
1221    UConverterMBCSTable *mbcsTable;
1222    const int32_t *extIndexes;
1223    uint8_t outputType;
1224    int8_t maxBytesPerUChar;
1225
1226    mbcsTable=&cnv->sharedData->mbcs;
1227    outputType=mbcsTable->outputType;
1228
1229    if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
1230        /* the swaplfnl option does not apply, remove it */
1231        cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
1232    }
1233
1234    if((options&UCNV_OPTION_SWAP_LFNL)!=0) {
1235        /* do this because double-checked locking is broken */
1236        UBool isCached;
1237
1238        umtx_lock(NULL);
1239        isCached=mbcsTable->swapLFNLStateTable!=NULL;
1240        umtx_unlock(NULL);
1241
1242        if(!isCached) {
1243            if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
1244                if(U_FAILURE(*pErrorCode)) {
1245                    return; /* something went wrong */
1246                }
1247
1248                /* the option does not apply, remove it */
1249                cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
1250            }
1251        }
1252    }
1253
1254    if(uprv_strstr(name, "18030")!=NULL) {
1255        if(uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL) {
1256            /* set a flag for GB 18030 mode, which changes the callback behavior */
1257            cnv->options|=_MBCS_OPTION_GB18030;
1258        }
1259    }
1260
1261    /* fix maxBytesPerUChar depending on outputType and options etc. */
1262    if(outputType==MBCS_OUTPUT_2_SISO) {
1263        cnv->maxBytesPerUChar=3; /* SO+DBCS */
1264    }
1265
1266    extIndexes=mbcsTable->extIndexes;
1267    if(extIndexes!=NULL) {
1268        maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
1269        if(outputType==MBCS_OUTPUT_2_SISO) {
1270            ++maxBytesPerUChar; /* SO + multiple DBCS */
1271        }
1272
1273        if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
1274            cnv->maxBytesPerUChar=maxBytesPerUChar;
1275        }
1276    }
1277
1278#if 0
1279    /*
1280     * documentation of UConverter fields used for status
1281     * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1282     */
1283
1284    /* toUnicode */
1285    cnv->toUnicodeStatus=0;     /* offset */
1286    cnv->mode=0;                /* state */
1287    cnv->toULength=0;           /* byteIndex */
1288
1289    /* fromUnicode */
1290    cnv->fromUChar32=0;
1291    cnv->fromUnicodeStatus=1;   /* prevLength */
1292#endif
1293}
1294
1295static const char *
1296ucnv_MBCSGetName(const UConverter *cnv) {
1297    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
1298        return cnv->sharedData->mbcs.swapLFNLName;
1299    } else {
1300        return cnv->sharedData->staticData->name;
1301    }
1302}
1303
1304/* MBCS-to-Unicode conversion functions ------------------------------------- */
1305
1306static UChar32
1307ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
1308    const _MBCSToUFallback *toUFallbacks;
1309    uint32_t i, start, limit;
1310
1311    limit=mbcsTable->countToUFallbacks;
1312    if(limit>0) {
1313        /* do a binary search for the fallback mapping */
1314        toUFallbacks=mbcsTable->toUFallbacks;
1315        start=0;
1316        while(start<limit-1) {
1317            i=(start+limit)/2;
1318            if(offset<toUFallbacks[i].offset) {
1319                limit=i;
1320            } else {
1321                start=i;
1322            }
1323        }
1324
1325        /* did we really find it? */
1326        if(offset==toUFallbacks[start].offset) {
1327            return toUFallbacks[start].codePoint;
1328        }
1329    }
1330
1331    return 0xfffe;
1332}
1333
1334/* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
1335static void
1336ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1337                                UErrorCode *pErrorCode) {
1338    UConverter *cnv;
1339    const uint8_t *source, *sourceLimit;
1340    UChar *target;
1341    const UChar *targetLimit;
1342    int32_t *offsets;
1343
1344    const int32_t (*stateTable)[256];
1345
1346    int32_t sourceIndex;
1347
1348    int32_t entry;
1349    UChar c;
1350    uint8_t action;
1351
1352    /* set up the local pointers */
1353    cnv=pArgs->converter;
1354    source=(const uint8_t *)pArgs->source;
1355    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1356    target=pArgs->target;
1357    targetLimit=pArgs->targetLimit;
1358    offsets=pArgs->offsets;
1359
1360    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1361        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1362    } else {
1363        stateTable=cnv->sharedData->mbcs.stateTable;
1364    }
1365
1366    /* sourceIndex=-1 if the current character began in the previous buffer */
1367    sourceIndex=0;
1368
1369    /* conversion loop */
1370    while(source<sourceLimit) {
1371        /*
1372         * This following test is to see if available input would overflow the output.
1373         * It does not catch output of more than one code unit that
1374         * overflows as a result of a surrogate pair or callback output
1375         * from the last source byte.
1376         * Therefore, those situations also test for overflows and will
1377         * then break the loop, too.
1378         */
1379        if(target>=targetLimit) {
1380            /* target is full */
1381            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1382            break;
1383        }
1384
1385        entry=stateTable[0][*source++];
1386        /* MBCS_ENTRY_IS_FINAL(entry) */
1387
1388        /* test the most common case first */
1389        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1390            /* output BMP code point */
1391            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1392            if(offsets!=NULL) {
1393                *offsets++=sourceIndex;
1394            }
1395
1396            /* normal end of action codes: prepare for a new character */
1397            ++sourceIndex;
1398            continue;
1399        }
1400
1401        /*
1402         * An if-else-if chain provides more reliable performance for
1403         * the most common cases compared to a switch.
1404         */
1405        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1406        if(action==MBCS_STATE_VALID_DIRECT_20 ||
1407           (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
1408        ) {
1409            entry=MBCS_ENTRY_FINAL_VALUE(entry);
1410            /* output surrogate pair */
1411            *target++=(UChar)(0xd800|(UChar)(entry>>10));
1412            if(offsets!=NULL) {
1413                *offsets++=sourceIndex;
1414            }
1415            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1416            if(target<targetLimit) {
1417                *target++=c;
1418                if(offsets!=NULL) {
1419                    *offsets++=sourceIndex;
1420                }
1421            } else {
1422                /* target overflow */
1423                cnv->UCharErrorBuffer[0]=c;
1424                cnv->UCharErrorBufferLength=1;
1425                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1426                break;
1427            }
1428
1429            ++sourceIndex;
1430            continue;
1431        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1432            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
1433                /* output BMP code point */
1434                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1435                if(offsets!=NULL) {
1436                    *offsets++=sourceIndex;
1437                }
1438
1439                ++sourceIndex;
1440                continue;
1441            }
1442        } else if(action==MBCS_STATE_UNASSIGNED) {
1443            /* just fall through */
1444        } else if(action==MBCS_STATE_ILLEGAL) {
1445            /* callback(illegal) */
1446            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1447        } else {
1448            /* reserved, must never occur */
1449            ++sourceIndex;
1450            continue;
1451        }
1452
1453        if(U_FAILURE(*pErrorCode)) {
1454            /* callback(illegal) */
1455            break;
1456        } else /* unassigned sequences indicated with byteIndex>0 */ {
1457            /* try an extension mapping */
1458            pArgs->source=(const char *)source;
1459            cnv->toUBytes[0]=*(source-1);
1460            cnv->toULength=_extToU(cnv, cnv->sharedData,
1461                                    1, &source, sourceLimit,
1462                                    &target, targetLimit,
1463                                    &offsets, sourceIndex,
1464                                    pArgs->flush,
1465                                    pErrorCode);
1466            sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
1467
1468            if(U_FAILURE(*pErrorCode)) {
1469                /* not mappable or buffer overflow */
1470                break;
1471            }
1472        }
1473    }
1474
1475    /* write back the updated pointers */
1476    pArgs->source=(const char *)source;
1477    pArgs->target=target;
1478    pArgs->offsets=offsets;
1479}
1480
1481/*
1482 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1483 * that only map to and from the BMP.
1484 * In addition to single-byte optimizations, the offset calculations
1485 * become much easier.
1486 */
1487static void
1488ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
1489                            UErrorCode *pErrorCode) {
1490    UConverter *cnv;
1491    const uint8_t *source, *sourceLimit, *lastSource;
1492    UChar *target;
1493    int32_t targetCapacity, length;
1494    int32_t *offsets;
1495
1496    const int32_t (*stateTable)[256];
1497
1498    int32_t sourceIndex;
1499
1500    int32_t entry;
1501    uint8_t action;
1502
1503    /* set up the local pointers */
1504    cnv=pArgs->converter;
1505    source=(const uint8_t *)pArgs->source;
1506    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1507    target=pArgs->target;
1508    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1509    offsets=pArgs->offsets;
1510
1511    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1512        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1513    } else {
1514        stateTable=cnv->sharedData->mbcs.stateTable;
1515    }
1516
1517    /* sourceIndex=-1 if the current character began in the previous buffer */
1518    sourceIndex=0;
1519    lastSource=source;
1520
1521    /*
1522     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1523     * for the minimum of the sourceLength and targetCapacity
1524     */
1525    length=(int32_t)(sourceLimit-source);
1526    if(length<targetCapacity) {
1527        targetCapacity=length;
1528    }
1529
1530#if MBCS_UNROLL_SINGLE_TO_BMP
1531    /* unrolling makes it faster on Pentium III/Windows 2000 */
1532    /* unroll the loop with the most common case */
1533unrolled:
1534    if(targetCapacity>=16) {
1535        int32_t count, loops, oredEntries;
1536
1537        loops=count=targetCapacity>>4;
1538        do {
1539            oredEntries=entry=stateTable[0][*source++];
1540            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1541            oredEntries|=entry=stateTable[0][*source++];
1542            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1543            oredEntries|=entry=stateTable[0][*source++];
1544            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1545            oredEntries|=entry=stateTable[0][*source++];
1546            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1547            oredEntries|=entry=stateTable[0][*source++];
1548            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1549            oredEntries|=entry=stateTable[0][*source++];
1550            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1551            oredEntries|=entry=stateTable[0][*source++];
1552            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1553            oredEntries|=entry=stateTable[0][*source++];
1554            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1555            oredEntries|=entry=stateTable[0][*source++];
1556            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1557            oredEntries|=entry=stateTable[0][*source++];
1558            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1559            oredEntries|=entry=stateTable[0][*source++];
1560            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1561            oredEntries|=entry=stateTable[0][*source++];
1562            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1563            oredEntries|=entry=stateTable[0][*source++];
1564            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1565            oredEntries|=entry=stateTable[0][*source++];
1566            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1567            oredEntries|=entry=stateTable[0][*source++];
1568            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1569            oredEntries|=entry=stateTable[0][*source++];
1570            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1571
1572            /* were all 16 entries really valid? */
1573            if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
1574                /* no, return to the first of these 16 */
1575                source-=16;
1576                target-=16;
1577                break;
1578            }
1579        } while(--count>0);
1580        count=loops-count;
1581        targetCapacity-=16*count;
1582
1583        if(offsets!=NULL) {
1584            lastSource+=16*count;
1585            while(count>0) {
1586                *offsets++=sourceIndex++;
1587                *offsets++=sourceIndex++;
1588                *offsets++=sourceIndex++;
1589                *offsets++=sourceIndex++;
1590                *offsets++=sourceIndex++;
1591                *offsets++=sourceIndex++;
1592                *offsets++=sourceIndex++;
1593                *offsets++=sourceIndex++;
1594                *offsets++=sourceIndex++;
1595                *offsets++=sourceIndex++;
1596                *offsets++=sourceIndex++;
1597                *offsets++=sourceIndex++;
1598                *offsets++=sourceIndex++;
1599                *offsets++=sourceIndex++;
1600                *offsets++=sourceIndex++;
1601                *offsets++=sourceIndex++;
1602                --count;
1603            }
1604        }
1605    }
1606#endif
1607
1608    /* conversion loop */
1609    while(targetCapacity>0) {
1610        entry=stateTable[0][*source++];
1611        /* MBCS_ENTRY_IS_FINAL(entry) */
1612
1613        /* test the most common case first */
1614        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1615            /* output BMP code point */
1616            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1617            --targetCapacity;
1618            continue;
1619        }
1620
1621        /*
1622         * An if-else-if chain provides more reliable performance for
1623         * the most common cases compared to a switch.
1624         */
1625        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1626        if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1627            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
1628                /* output BMP code point */
1629                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1630                --targetCapacity;
1631                continue;
1632            }
1633        } else if(action==MBCS_STATE_UNASSIGNED) {
1634            /* just fall through */
1635        } else if(action==MBCS_STATE_ILLEGAL) {
1636            /* callback(illegal) */
1637            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1638        } else {
1639            /* reserved, must never occur */
1640            continue;
1641        }
1642
1643        /* set offsets since the start or the last extension */
1644        if(offsets!=NULL) {
1645            int32_t count=(int32_t)(source-lastSource);
1646
1647            /* predecrement: do not set the offset for the callback-causing character */
1648            while(--count>0) {
1649                *offsets++=sourceIndex++;
1650            }
1651            /* offset and sourceIndex are now set for the current character */
1652        }
1653
1654        if(U_FAILURE(*pErrorCode)) {
1655            /* callback(illegal) */
1656            break;
1657        } else /* unassigned sequences indicated with byteIndex>0 */ {
1658            /* try an extension mapping */
1659            lastSource=source;
1660            cnv->toUBytes[0]=*(source-1);
1661            cnv->toULength=_extToU(cnv, cnv->sharedData,
1662                                    1, &source, sourceLimit,
1663                                    &target, target+targetCapacity,
1664                                    &offsets, sourceIndex,
1665                                    pArgs->flush,
1666                                    pErrorCode);
1667            sourceIndex+=1+(int32_t)(source-lastSource);
1668
1669            if(U_FAILURE(*pErrorCode)) {
1670                /* not mappable or buffer overflow */
1671                break;
1672            }
1673
1674            /* recalculate the targetCapacity after an extension mapping */
1675            targetCapacity=(int32_t)(pArgs->targetLimit-target);
1676            length=(int32_t)(sourceLimit-source);
1677            if(length<targetCapacity) {
1678                targetCapacity=length;
1679            }
1680        }
1681
1682#if MBCS_UNROLL_SINGLE_TO_BMP
1683        /* unrolling makes it faster on Pentium III/Windows 2000 */
1684        goto unrolled;
1685#endif
1686    }
1687
1688    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
1689        /* target is full */
1690        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1691    }
1692
1693    /* set offsets since the start or the last callback */
1694    if(offsets!=NULL) {
1695        size_t count=source-lastSource;
1696        while(count>0) {
1697            *offsets++=sourceIndex++;
1698            --count;
1699        }
1700    }
1701
1702    /* write back the updated pointers */
1703    pArgs->source=(const char *)source;
1704    pArgs->target=target;
1705    pArgs->offsets=offsets;
1706}
1707
1708U_CFUNC void
1709ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1710                          UErrorCode *pErrorCode) {
1711    UConverter *cnv;
1712    const uint8_t *source, *sourceLimit;
1713    UChar *target;
1714    const UChar *targetLimit;
1715    int32_t *offsets;
1716
1717    const int32_t (*stateTable)[256];
1718    const uint16_t *unicodeCodeUnits;
1719
1720    uint32_t offset;
1721    uint8_t state;
1722    int8_t byteIndex;
1723    uint8_t *bytes;
1724
1725    int32_t sourceIndex, nextSourceIndex;
1726
1727    int32_t entry;
1728    UChar c;
1729    uint8_t action;
1730
1731    /* use optimized function if possible */
1732    cnv=pArgs->converter;
1733
1734    if(cnv->preToULength>0) {
1735        /*
1736         * pass sourceIndex=-1 because we continue from an earlier buffer
1737         * in the future, this may change with continuous offsets
1738         */
1739        ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
1740
1741        if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
1742            return;
1743        }
1744    }
1745
1746    if(cnv->sharedData->mbcs.countStates==1) {
1747        if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1748            ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
1749        } else {
1750            ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
1751        }
1752        return;
1753    }
1754
1755    /* set up the local pointers */
1756    source=(const uint8_t *)pArgs->source;
1757    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1758    target=pArgs->target;
1759    targetLimit=pArgs->targetLimit;
1760    offsets=pArgs->offsets;
1761
1762    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1763        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1764    } else {
1765        stateTable=cnv->sharedData->mbcs.stateTable;
1766    }
1767    unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
1768
1769    /* get the converter state from UConverter */
1770    offset=cnv->toUnicodeStatus;
1771    byteIndex=cnv->toULength;
1772    bytes=cnv->toUBytes;
1773
1774    /*
1775     * if we are in the SBCS state for a DBCS-only converter,
1776     * then load the DBCS state from the MBCS data
1777     * (dbcsOnlyState==0 if it is not a DBCS-only converter)
1778     */
1779    if((state=(uint8_t)(cnv->mode))==0) {
1780        state=cnv->sharedData->mbcs.dbcsOnlyState;
1781    }
1782
1783    /* sourceIndex=-1 if the current character began in the previous buffer */
1784    sourceIndex=byteIndex==0 ? 0 : -1;
1785    nextSourceIndex=0;
1786
1787    /* conversion loop */
1788    while(source<sourceLimit) {
1789        /*
1790         * This following test is to see if available input would overflow the output.
1791         * It does not catch output of more than one code unit that
1792         * overflows as a result of a surrogate pair or callback output
1793         * from the last source byte.
1794         * Therefore, those situations also test for overflows and will
1795         * then break the loop, too.
1796         */
1797        if(target>=targetLimit) {
1798            /* target is full */
1799            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1800            break;
1801        }
1802
1803        if(byteIndex==0) {
1804            /* optimized loop for 1/2-byte input and BMP output */
1805            if(offsets==NULL) {
1806                do {
1807                    entry=stateTable[state][*source];
1808                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1809                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1810                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1811
1812                        ++source;
1813                        if( source<sourceLimit &&
1814                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
1815                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
1816                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
1817                        ) {
1818                            ++source;
1819                            *target++=c;
1820                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1821                            offset=0;
1822                        } else {
1823                            /* set the state and leave the optimized loop */
1824                            bytes[0]=*(source-1);
1825                            byteIndex=1;
1826                            break;
1827                        }
1828                    } else {
1829                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1830                            /* output BMP code point */
1831                            ++source;
1832                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1833                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1834                        } else {
1835                            /* leave the optimized loop */
1836                            break;
1837                        }
1838                    }
1839                } while(source<sourceLimit && target<targetLimit);
1840            } else /* offsets!=NULL */ {
1841                do {
1842                    entry=stateTable[state][*source];
1843                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1844                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1845                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1846
1847                        ++source;
1848                        if( source<sourceLimit &&
1849                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
1850                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
1851                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
1852                        ) {
1853                            ++source;
1854                            *target++=c;
1855                            if(offsets!=NULL) {
1856                                *offsets++=sourceIndex;
1857                                sourceIndex=(nextSourceIndex+=2);
1858                            }
1859                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1860                            offset=0;
1861                        } else {
1862                            /* set the state and leave the optimized loop */
1863                            ++nextSourceIndex;
1864                            bytes[0]=*(source-1);
1865                            byteIndex=1;
1866                            break;
1867                        }
1868                    } else {
1869                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1870                            /* output BMP code point */
1871                            ++source;
1872                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1873                            if(offsets!=NULL) {
1874                                *offsets++=sourceIndex;
1875                                sourceIndex=++nextSourceIndex;
1876                            }
1877                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1878                        } else {
1879                            /* leave the optimized loop */
1880                            break;
1881                        }
1882                    }
1883                } while(source<sourceLimit && target<targetLimit);
1884            }
1885
1886            /*
1887             * these tests and break statements could be put inside the loop
1888             * if C had "break outerLoop" like Java
1889             */
1890            if(source>=sourceLimit) {
1891                break;
1892            }
1893            if(target>=targetLimit) {
1894                /* target is full */
1895                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1896                break;
1897            }
1898
1899            ++nextSourceIndex;
1900            bytes[byteIndex++]=*source++;
1901        } else /* byteIndex>0 */ {
1902            ++nextSourceIndex;
1903            entry=stateTable[state][bytes[byteIndex++]=*source++];
1904        }
1905
1906        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1907            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1908            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1909            continue;
1910        }
1911
1912        /* save the previous state for proper extension mapping with SI/SO-stateful converters */
1913        cnv->mode=state;
1914
1915        /* set the next state early so that we can reuse the entry variable */
1916        state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1917
1918        /*
1919         * An if-else-if chain provides more reliable performance for
1920         * the most common cases compared to a switch.
1921         */
1922        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1923        if(action==MBCS_STATE_VALID_16) {
1924            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
1925            c=unicodeCodeUnits[offset];
1926            if(c<0xfffe) {
1927                /* output BMP code point */
1928                *target++=c;
1929                if(offsets!=NULL) {
1930                    *offsets++=sourceIndex;
1931                }
1932                byteIndex=0;
1933            } else if(c==0xfffe) {
1934                if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
1935                    /* output fallback BMP code point */
1936                    *target++=(UChar)entry;
1937                    if(offsets!=NULL) {
1938                        *offsets++=sourceIndex;
1939                    }
1940                    byteIndex=0;
1941                }
1942            } else {
1943                /* callback(illegal) */
1944                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1945            }
1946        } else if(action==MBCS_STATE_VALID_DIRECT_16) {
1947            /* output BMP code point */
1948            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1949            if(offsets!=NULL) {
1950                *offsets++=sourceIndex;
1951            }
1952            byteIndex=0;
1953        } else if(action==MBCS_STATE_VALID_16_PAIR) {
1954            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
1955            c=unicodeCodeUnits[offset++];
1956            if(c<0xd800) {
1957                /* output BMP code point below 0xd800 */
1958                *target++=c;
1959                if(offsets!=NULL) {
1960                    *offsets++=sourceIndex;
1961                }
1962                byteIndex=0;
1963            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
1964                /* output roundtrip or fallback surrogate pair */
1965                *target++=(UChar)(c&0xdbff);
1966                if(offsets!=NULL) {
1967                    *offsets++=sourceIndex;
1968                }
1969                byteIndex=0;
1970                if(target<targetLimit) {
1971                    *target++=unicodeCodeUnits[offset];
1972                    if(offsets!=NULL) {
1973                        *offsets++=sourceIndex;
1974                    }
1975                } else {
1976                    /* target overflow */
1977                    cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
1978                    cnv->UCharErrorBufferLength=1;
1979                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1980
1981                    offset=0;
1982                    break;
1983                }
1984            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
1985                /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
1986                *target++=unicodeCodeUnits[offset];
1987                if(offsets!=NULL) {
1988                    *offsets++=sourceIndex;
1989                }
1990                byteIndex=0;
1991            } else if(c==0xffff) {
1992                /* callback(illegal) */
1993                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1994            }
1995        } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
1996                  (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
1997        ) {
1998            entry=MBCS_ENTRY_FINAL_VALUE(entry);
1999            /* output surrogate pair */
2000            *target++=(UChar)(0xd800|(UChar)(entry>>10));
2001            if(offsets!=NULL) {
2002                *offsets++=sourceIndex;
2003            }
2004            byteIndex=0;
2005            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
2006            if(target<targetLimit) {
2007                *target++=c;
2008                if(offsets!=NULL) {
2009                    *offsets++=sourceIndex;
2010                }
2011            } else {
2012                /* target overflow */
2013                cnv->UCharErrorBuffer[0]=c;
2014                cnv->UCharErrorBufferLength=1;
2015                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2016
2017                offset=0;
2018                break;
2019            }
2020        } else if(action==MBCS_STATE_CHANGE_ONLY) {
2021            /*
2022             * This serves as a state change without any output.
2023             * It is useful for reading simple stateful encodings,
2024             * for example using just Shift-In/Shift-Out codes.
2025             * The 21 unused bits may later be used for more sophisticated
2026             * state transitions.
2027             */
2028            if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
2029                byteIndex=0;
2030            } else {
2031                /* SI/SO are illegal for DBCS-only conversion */
2032                state=(uint8_t)(cnv->mode); /* restore the previous state */
2033
2034                /* callback(illegal) */
2035                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2036            }
2037        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2038            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2039                /* output BMP code point */
2040                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2041                if(offsets!=NULL) {
2042                    *offsets++=sourceIndex;
2043                }
2044                byteIndex=0;
2045            }
2046        } else if(action==MBCS_STATE_UNASSIGNED) {
2047            /* just fall through */
2048        } else if(action==MBCS_STATE_ILLEGAL) {
2049            /* callback(illegal) */
2050            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2051        } else {
2052            /* reserved, must never occur */
2053            byteIndex=0;
2054        }
2055
2056        /* end of action codes: prepare for a new character */
2057        offset=0;
2058
2059        if(byteIndex==0) {
2060            sourceIndex=nextSourceIndex;
2061        } else if(U_FAILURE(*pErrorCode)) {
2062            /* callback(illegal) */
2063            break;
2064        } else /* unassigned sequences indicated with byteIndex>0 */ {
2065            /* try an extension mapping */
2066            pArgs->source=(const char *)source;
2067            byteIndex=_extToU(cnv, cnv->sharedData,
2068                              byteIndex, &source, sourceLimit,
2069                              &target, targetLimit,
2070                              &offsets, sourceIndex,
2071                              pArgs->flush,
2072                              pErrorCode);
2073            sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
2074
2075            if(U_FAILURE(*pErrorCode)) {
2076                /* not mappable or buffer overflow */
2077                break;
2078            }
2079        }
2080    }
2081
2082    /* set the converter state back into UConverter */
2083    cnv->toUnicodeStatus=offset;
2084    cnv->mode=state;
2085    cnv->toULength=byteIndex;
2086
2087    /* write back the updated pointers */
2088    pArgs->source=(const char *)source;
2089    pArgs->target=target;
2090    pArgs->offsets=offsets;
2091}
2092
2093/*
2094 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2095 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
2096 */
2097static UChar32
2098ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
2099                        UErrorCode *pErrorCode) {
2100    UConverter *cnv;
2101    const int32_t (*stateTable)[256];
2102    const uint8_t *source, *sourceLimit;
2103
2104    int32_t entry;
2105    uint8_t action;
2106
2107    /* set up the local pointers */
2108    cnv=pArgs->converter;
2109    source=(const uint8_t *)pArgs->source;
2110    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2111    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2112        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2113    } else {
2114        stateTable=cnv->sharedData->mbcs.stateTable;
2115    }
2116
2117    /* conversion loop */
2118    while(source<sourceLimit) {
2119        entry=stateTable[0][*source++];
2120        /* MBCS_ENTRY_IS_FINAL(entry) */
2121
2122        /* write back the updated pointer early so that we can return directly */
2123        pArgs->source=(const char *)source;
2124
2125        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2126            /* output BMP code point */
2127            return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2128        }
2129
2130        /*
2131         * An if-else-if chain provides more reliable performance for
2132         * the most common cases compared to a switch.
2133         */
2134        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2135        if( action==MBCS_STATE_VALID_DIRECT_20 ||
2136            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2137        ) {
2138            /* output supplementary code point */
2139            return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2140        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2141            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2142                /* output BMP code point */
2143                return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2144            }
2145        } else if(action==MBCS_STATE_UNASSIGNED) {
2146            /* just fall through */
2147        } else if(action==MBCS_STATE_ILLEGAL) {
2148            /* callback(illegal) */
2149            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2150        } else {
2151            /* reserved, must never occur */
2152            continue;
2153        }
2154
2155        if(U_FAILURE(*pErrorCode)) {
2156            /* callback(illegal) */
2157            break;
2158        } else /* unassigned sequence */ {
2159            /* defer to the generic implementation */
2160            pArgs->source=(const char *)source-1;
2161            return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2162        }
2163    }
2164
2165    /* no output because of empty input or only state changes */
2166    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2167    return 0xffff;
2168}
2169
/*
 * Version of ucnv_MBCSToUnicodeWithOffsets() optimized for single-character
 * conversion without offset handling.
 *
 * When a character does not have a mapping to Unicode, then we return to the
 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
 * handling.
 * We also defer to the generic code in other complicated cases and have them
 * ultimately handled by ucnv_MBCSToUnicodeWithOffsets() itself.
 *
 * All normal mappings and errors are handled here.
 */
2182static UChar32
2183ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
2184                  UErrorCode *pErrorCode) {
2185    UConverter *cnv;
2186    const uint8_t *source, *sourceLimit, *lastSource;
2187
2188    const int32_t (*stateTable)[256];
2189    const uint16_t *unicodeCodeUnits;
2190
2191    uint32_t offset;
2192    uint8_t state;
2193
2194    int32_t entry;
2195    UChar32 c;
2196    uint8_t action;
2197
2198    /* use optimized function if possible */
2199    cnv=pArgs->converter;
2200
2201    if(cnv->preToULength>0) {
2202        /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2203        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2204    }
2205
2206    if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
2207        /*
2208         * Using the generic ucnv_getNextUChar() code lets us deal correctly
2209         * with the rare case of a codepage that maps single surrogates
2210         * without adding the complexity to this already complicated function here.
2211         */
2212        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2213    } else if(cnv->sharedData->mbcs.countStates==1) {
2214        return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
2215    }
2216
2217    /* set up the local pointers */
2218    source=lastSource=(const uint8_t *)pArgs->source;
2219    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2220
2221    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2222        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2223    } else {
2224        stateTable=cnv->sharedData->mbcs.stateTable;
2225    }
2226    unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2227
2228    /* get the converter state from UConverter */
2229    offset=cnv->toUnicodeStatus;
2230
2231    /*
2232     * if we are in the SBCS state for a DBCS-only converter,
2233     * then load the DBCS state from the MBCS data
2234     * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2235     */
2236    if((state=(uint8_t)(cnv->mode))==0) {
2237        state=cnv->sharedData->mbcs.dbcsOnlyState;
2238    }
2239
2240    /* conversion loop */
2241    c=U_SENTINEL;
2242    while(source<sourceLimit) {
2243        entry=stateTable[state][*source++];
2244        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2245            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2246            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2247
2248            /* optimization for 1/2-byte input and BMP output */
2249            if( source<sourceLimit &&
2250                MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2251                MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2252                (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2253            ) {
2254                ++source;
2255                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2256                /* output BMP code point */
2257                break;
2258            }
2259        } else {
2260            /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2261            cnv->mode=state;
2262
2263            /* set the next state early so that we can reuse the entry variable */
2264            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2265
2266            /*
2267             * An if-else-if chain provides more reliable performance for
2268             * the most common cases compared to a switch.
2269             */
2270            action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2271            if(action==MBCS_STATE_VALID_DIRECT_16) {
2272                /* output BMP code point */
2273                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2274                break;
2275            } else if(action==MBCS_STATE_VALID_16) {
2276                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2277                c=unicodeCodeUnits[offset];
2278                if(c<0xfffe) {
2279                    /* output BMP code point */
2280                    break;
2281                } else if(c==0xfffe) {
2282                    if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2283                        break;
2284                    }
2285                } else {
2286                    /* callback(illegal) */
2287                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2288                }
2289            } else if(action==MBCS_STATE_VALID_16_PAIR) {
2290                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2291                c=unicodeCodeUnits[offset++];
2292                if(c<0xd800) {
2293                    /* output BMP code point below 0xd800 */
2294                    break;
2295                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2296                    /* output roundtrip or fallback supplementary code point */
2297                    c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
2298                    break;
2299                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2300                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2301                    c=unicodeCodeUnits[offset];
2302                    break;
2303                } else if(c==0xffff) {
2304                    /* callback(illegal) */
2305                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2306                }
2307            } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2308                      (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2309            ) {
2310                /* output supplementary code point */
2311                c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2312                break;
2313            } else if(action==MBCS_STATE_CHANGE_ONLY) {
2314                /*
2315                 * This serves as a state change without any output.
2316                 * It is useful for reading simple stateful encodings,
2317                 * for example using just Shift-In/Shift-Out codes.
2318                 * The 21 unused bits may later be used for more sophisticated
2319                 * state transitions.
2320                 */
2321                if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
2322                    /* SI/SO are illegal for DBCS-only conversion */
2323                    state=(uint8_t)(cnv->mode); /* restore the previous state */
2324
2325                    /* callback(illegal) */
2326                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2327                }
2328            } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2329                if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2330                    /* output BMP code point */
2331                    c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2332                    break;
2333                }
2334            } else if(action==MBCS_STATE_UNASSIGNED) {
2335                /* just fall through */
2336            } else if(action==MBCS_STATE_ILLEGAL) {
2337                /* callback(illegal) */
2338                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2339            } else {
2340                /* reserved (must never occur), or only state change */
2341                offset=0;
2342                lastSource=source;
2343                continue;
2344            }
2345
2346            /* end of action codes: prepare for a new character */
2347            offset=0;
2348
2349            if(U_FAILURE(*pErrorCode)) {
2350                /* callback(illegal) */
2351                break;
2352            } else /* unassigned sequence */ {
2353                /* defer to the generic implementation */
2354                cnv->toUnicodeStatus=0;
2355                cnv->mode=state;
2356                pArgs->source=(const char *)lastSource;
2357                return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2358            }
2359        }
2360    }
2361
2362    if(c<0) {
2363        if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
2364            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
2365        }
2366        if(U_FAILURE(*pErrorCode)) {
2367            /* incomplete character byte sequence */
2368            uint8_t *bytes=cnv->toUBytes;
2369            cnv->toULength=(int8_t)(source-lastSource);
2370            do {
2371                *bytes++=*lastSource++;
2372            } while(lastSource<source);
2373        } else {
2374            /* no output because of empty input or only state changes */
2375            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2376        }
2377        c=0xffff;
2378    }
2379
2380    /* set the converter state back into UConverter, ready for a new character */
2381    cnv->toUnicodeStatus=0;
2382    cnv->mode=state;
2383
2384    /* write back the updated pointer */
2385    pArgs->source=(const char *)source;
2386    return c;
2387}
2388
2389#if 0
2390/*
2391 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
2392 * Removal improves code coverage.
2393 */
2394/**
2395 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
2396 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
2397 * It does not handle conversion extensions (_extToU()).
2398 */
2399U_CFUNC UChar32
2400ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
2401                              uint8_t b, UBool useFallback) {
2402    int32_t entry;
2403    uint8_t action;
2404
2405    entry=sharedData->mbcs.stateTable[0][b];
2406    /* MBCS_ENTRY_IS_FINAL(entry) */
2407
2408    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2409        /* output BMP code point */
2410        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2411    }
2412
2413    /*
2414     * An if-else-if chain provides more reliable performance for
2415     * the most common cases compared to a switch.
2416     */
2417    action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2418    if(action==MBCS_STATE_VALID_DIRECT_20) {
2419        /* output supplementary code point */
2420        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2421    } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2422        if(!TO_U_USE_FALLBACK(useFallback)) {
2423            return 0xfffe;
2424        }
2425        /* output BMP code point */
2426        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2427    } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
2428        if(!TO_U_USE_FALLBACK(useFallback)) {
2429            return 0xfffe;
2430        }
2431        /* output supplementary code point */
2432        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2433    } else if(action==MBCS_STATE_UNASSIGNED) {
2434        return 0xfffe;
2435    } else if(action==MBCS_STATE_ILLEGAL) {
2436        return 0xffff;
2437    } else {
2438        /* reserved, must never occur */
2439        return 0xffff;
2440    }
2441}
2442#endif
2443
2444/*
2445 * This is a simple version of _MBCSGetNextUChar() that is used
2446 * by other converter implementations.
2447 * It only returns an "assigned" result if it consumes the entire input.
2448 * It does not use state from the converter, nor error codes.
2449 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
2450 * It handles conversion extensions but not GB 18030.
2451 *
2452 * Return value:
2453 * U+fffe   unassigned
2454 * U+ffff   illegal
2455 * otherwise the Unicode code point
2456 */
2457U_CFUNC UChar32
2458ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
2459                        const char *source, int32_t length,
2460                        UBool useFallback) {
2461    const int32_t (*stateTable)[256];
2462    const uint16_t *unicodeCodeUnits;
2463
2464    uint32_t offset;
2465    uint8_t state, action;
2466
2467    UChar32 c;
2468    int32_t i, entry;
2469
2470    if(length<=0) {
2471        /* no input at all: "illegal" */
2472        return 0xffff;
2473    }
2474
2475#if 0
2476/*
2477 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
2478 * TODO In future releases, verify that this function is never called for SBCS
2479 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
2480 * Removal improves code coverage.
2481 */
2482    /* use optimized function if possible */
2483    if(sharedData->mbcs.countStates==1) {
2484        if(length==1) {
2485            return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
2486        } else {
2487            return 0xffff; /* illegal: more than a single byte for an SBCS converter */
2488        }
2489    }
2490#endif
2491
2492    /* set up the local pointers */
2493    stateTable=sharedData->mbcs.stateTable;
2494    unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
2495
2496    /* converter state */
2497    offset=0;
2498    state=sharedData->mbcs.dbcsOnlyState;
2499
2500    /* conversion loop */
2501    for(i=0;;) {
2502        entry=stateTable[state][(uint8_t)source[i++]];
2503        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2504            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2505            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2506
2507            if(i==length) {
2508                return 0xffff; /* truncated character */
2509            }
2510        } else {
2511            /*
2512             * An if-else-if chain provides more reliable performance for
2513             * the most common cases compared to a switch.
2514             */
2515            action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2516            if(action==MBCS_STATE_VALID_16) {
2517                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2518                c=unicodeCodeUnits[offset];
2519                if(c!=0xfffe) {
2520                    /* done */
2521                } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2522                    c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
2523                /* else done with 0xfffe */
2524                }
2525                break;
2526            } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2527                /* output BMP code point */
2528                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2529                break;
2530            } else if(action==MBCS_STATE_VALID_16_PAIR) {
2531                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2532                c=unicodeCodeUnits[offset++];
2533                if(c<0xd800) {
2534                    /* output BMP code point below 0xd800 */
2535                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2536                    /* output roundtrip or fallback supplementary code point */
2537                    c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
2538                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2539                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2540                    c=unicodeCodeUnits[offset];
2541                } else if(c==0xffff) {
2542                    return 0xffff;
2543                } else {
2544                    c=0xfffe;
2545                }
2546                break;
2547            } else if(action==MBCS_STATE_VALID_DIRECT_20) {
2548                /* output supplementary code point */
2549                c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2550                break;
2551            } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2552                if(!TO_U_USE_FALLBACK(useFallback)) {
2553                    c=0xfffe;
2554                    break;
2555                }
2556                /* output BMP code point */
2557                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2558                break;
2559            } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
2560                if(!TO_U_USE_FALLBACK(useFallback)) {
2561                    c=0xfffe;
2562                    break;
2563                }
2564                /* output supplementary code point */
2565                c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2566                break;
2567            } else if(action==MBCS_STATE_UNASSIGNED) {
2568                c=0xfffe;
2569                break;
2570            }
2571
2572            /*
2573             * forbid MBCS_STATE_CHANGE_ONLY for this function,
2574             * and MBCS_STATE_ILLEGAL and reserved action codes
2575             */
2576            return 0xffff;
2577        }
2578    }
2579
2580    if(i!=length) {
2581        /* illegal for this function: not all input consumed */
2582        return 0xffff;
2583    }
2584
2585    if(c==0xfffe) {
2586        /* try an extension mapping */
2587        const int32_t *cx=sharedData->mbcs.extIndexes;
2588        if(cx!=NULL) {
2589            return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
2590        }
2591    }
2592
2593    return c;
2594}
2595
2596/* MBCS-from-Unicode conversion functions ----------------------------------- */
2597
2598/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
2599static void
2600ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2601                                  UErrorCode *pErrorCode) {
2602    UConverter *cnv;
2603    const UChar *source, *sourceLimit;
2604    uint8_t *target;
2605    int32_t targetCapacity;
2606    int32_t *offsets;
2607
2608    const uint16_t *table;
2609    const uint16_t *mbcsIndex;
2610    const uint8_t *bytes;
2611
2612    UChar32 c;
2613
2614    int32_t sourceIndex, nextSourceIndex;
2615
2616    uint32_t stage2Entry;
2617    uint32_t asciiRoundtrips;
2618    uint32_t value;
2619    uint8_t unicodeMask;
2620
2621    /* use optimized function if possible */
2622    cnv=pArgs->converter;
2623    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
2624
2625    /* set up the local pointers */
2626    source=pArgs->source;
2627    sourceLimit=pArgs->sourceLimit;
2628    target=(uint8_t *)pArgs->target;
2629    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
2630    offsets=pArgs->offsets;
2631
2632    table=cnv->sharedData->mbcs.fromUnicodeTable;
2633    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
2634    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2635        bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
2636    } else {
2637        bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
2638    }
2639    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
2640
2641    /* get the converter state from UConverter */
2642    c=cnv->fromUChar32;
2643
2644    /* sourceIndex=-1 if the current character began in the previous buffer */
2645    sourceIndex= c==0 ? 0 : -1;
2646    nextSourceIndex=0;
2647
2648    /* conversion loop */
2649    if(c!=0 && targetCapacity>0) {
2650        goto getTrail;
2651    }
2652
2653    while(source<sourceLimit) {
2654        /*
2655         * This following test is to see if available input would overflow the output.
2656         * It does not catch output of more than one byte that
2657         * overflows as a result of a multi-byte character or callback output
2658         * from the last source character.
2659         * Therefore, those situations also test for overflows and will
2660         * then break the loop, too.
2661         */
2662        if(targetCapacity>0) {
2663            /*
2664             * Get a correct Unicode code point:
2665             * a single UChar for a BMP code point or
2666             * a matched surrogate pair for a "supplementary code point".
2667             */
2668            c=*source++;
2669            ++nextSourceIndex;
2670            if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
2671                *target++=(uint8_t)c;
2672                if(offsets!=NULL) {
2673                    *offsets++=sourceIndex;
2674                    sourceIndex=nextSourceIndex;
2675                }
2676                --targetCapacity;
2677                c=0;
2678                continue;
2679            }
2680            /*
2681             * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
2682             * to avoid dealing with surrogates.
2683             * MBCS_FAST_MAX must be >=0xd7ff.
2684             */
2685            if(c<=0xd7ff) {
2686                value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
2687                /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
2688                if(value==0) {
2689                    goto unassigned;
2690                }
2691                /* output the value */
2692            } else {
2693                /*
2694                 * This also tests if the codepage maps single surrogates.
2695                 * If it does, then surrogates are not paired but mapped separately.
2696                 * Note that in this case unmatched surrogates are not detected.
2697                 */
2698                if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
2699                    if(UTF_IS_SURROGATE_FIRST(c)) {
2700getTrail:
2701                        if(source<sourceLimit) {
2702                            /* test the following code unit */
2703                            UChar trail=*source;
2704                            if(UTF_IS_SECOND_SURROGATE(trail)) {
2705                                ++source;
2706                                ++nextSourceIndex;
2707                                c=UTF16_GET_PAIR_VALUE(c, trail);
2708                                if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2709                                    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2710                                    /* callback(unassigned) */
2711                                    goto unassigned;
2712                                }
2713                                /* convert this supplementary code point */
2714                                /* exit this condition tree */
2715                            } else {
2716                                /* this is an unmatched lead code unit (1st surrogate) */
2717                                /* callback(illegal) */
2718                                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2719                                break;
2720                            }
2721                        } else {
2722                            /* no more input */
2723                            break;
2724                        }
2725                    } else {
2726                        /* this is an unmatched trail code unit (2nd surrogate) */
2727                        /* callback(illegal) */
2728                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2729                        break;
2730                    }
2731                }
2732
2733                /* convert the Unicode code point in c into codepage bytes */
2734                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
2735
2736                /* get the bytes and the length for the output */
2737                /* MBCS_OUTPUT_2 */
2738                value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2739
2740                /* is this code point assigned, or do we use fallbacks? */
2741                if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
2742                     (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
2743                ) {
2744                    /*
2745                     * We allow a 0 byte output if the "assigned" bit is set for this entry.
2746                     * There is no way with this data structure for fallback output
2747                     * to be a zero byte.
2748                     */
2749
2750unassigned:
2751                    /* try an extension mapping */
2752                    pArgs->source=source;
2753                    c=_extFromU(cnv, cnv->sharedData,
2754                                c, &source, sourceLimit,
2755                                &target, target+targetCapacity,
2756                                &offsets, sourceIndex,
2757                                pArgs->flush,
2758                                pErrorCode);
2759                    nextSourceIndex+=(int32_t)(source-pArgs->source);
2760
2761                    if(U_FAILURE(*pErrorCode)) {
2762                        /* not mappable or buffer overflow */
2763                        break;
2764                    } else {
2765                        /* a mapping was written to the target, continue */
2766
2767                        /* recalculate the targetCapacity after an extension mapping */
2768                        targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
2769
2770                        /* normal end of conversion: prepare for a new character */
2771                        sourceIndex=nextSourceIndex;
2772                        continue;
2773                    }
2774                }
2775            }
2776
2777            /* write the output character bytes from value and length */
2778            /* from the first if in the loop we know that targetCapacity>0 */
2779            if(value<=0xff) {
2780                /* this is easy because we know that there is enough space */
2781                *target++=(uint8_t)value;
2782                if(offsets!=NULL) {
2783                    *offsets++=sourceIndex;
2784                }
2785                --targetCapacity;
2786            } else /* length==2 */ {
2787                *target++=(uint8_t)(value>>8);
2788                if(2<=targetCapacity) {
2789                    *target++=(uint8_t)value;
2790                    if(offsets!=NULL) {
2791                        *offsets++=sourceIndex;
2792                        *offsets++=sourceIndex;
2793                    }
2794                    targetCapacity-=2;
2795                } else {
2796                    if(offsets!=NULL) {
2797                        *offsets++=sourceIndex;
2798                    }
2799                    cnv->charErrorBuffer[0]=(char)value;
2800                    cnv->charErrorBufferLength=1;
2801
2802                    /* target overflow */
2803                    targetCapacity=0;
2804                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2805                    c=0;
2806                    break;
2807                }
2808            }
2809
2810            /* normal end of conversion: prepare for a new character */
2811            c=0;
2812            sourceIndex=nextSourceIndex;
2813            continue;
2814        } else {
2815            /* target is full */
2816            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2817            break;
2818        }
2819    }
2820
2821    /* set the converter state back into UConverter */
2822    cnv->fromUChar32=c;
2823
2824    /* write back the updated pointers */
2825    pArgs->source=source;
2826    pArgs->target=(char *)target;
2827    pArgs->offsets=offsets;
2828}
2829
2830/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
2831static void
2832ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2833                                  UErrorCode *pErrorCode) {
2834    UConverter *cnv;
2835    const UChar *source, *sourceLimit;
2836    uint8_t *target;
2837    int32_t targetCapacity;
2838    int32_t *offsets;
2839
2840    const uint16_t *table;
2841    const uint16_t *results;
2842
2843    UChar32 c;
2844
2845    int32_t sourceIndex, nextSourceIndex;
2846
2847    uint16_t value, minValue;
2848    UBool hasSupplementary;
2849
2850    /* set up the local pointers */
2851    cnv=pArgs->converter;
2852    source=pArgs->source;
2853    sourceLimit=pArgs->sourceLimit;
2854    target=(uint8_t *)pArgs->target;
2855    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
2856    offsets=pArgs->offsets;
2857
2858    table=cnv->sharedData->mbcs.fromUnicodeTable;
2859    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2860        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
2861    } else {
2862        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
2863    }
2864
2865    if(cnv->useFallback) {
2866        /* use all roundtrip and fallback results */
2867        minValue=0x800;
2868    } else {
2869        /* use only roundtrips and fallbacks from private-use characters */
2870        minValue=0xc00;
2871    }
2872    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
2873
2874    /* get the converter state from UConverter */
2875    c=cnv->fromUChar32;
2876
2877    /* sourceIndex=-1 if the current character began in the previous buffer */
2878    sourceIndex= c==0 ? 0 : -1;
2879    nextSourceIndex=0;
2880
2881    /* conversion loop */
2882    if(c!=0 && targetCapacity>0) {
2883        goto getTrail;
2884    }
2885
2886    while(source<sourceLimit) {
2887        /*
2888         * This following test is to see if available input would overflow the output.
2889         * It does not catch output of more than one byte that
2890         * overflows as a result of a multi-byte character or callback output
2891         * from the last source character.
2892         * Therefore, those situations also test for overflows and will
2893         * then break the loop, too.
2894         */
2895        if(targetCapacity>0) {
2896            /*
2897             * Get a correct Unicode code point:
2898             * a single UChar for a BMP code point or
2899             * a matched surrogate pair for a "supplementary code point".
2900             */
2901            c=*source++;
2902            ++nextSourceIndex;
2903            if(UTF_IS_SURROGATE(c)) {
2904                if(UTF_IS_SURROGATE_FIRST(c)) {
2905getTrail:
2906                    if(source<sourceLimit) {
2907                        /* test the following code unit */
2908                        UChar trail=*source;
2909                        if(UTF_IS_SECOND_SURROGATE(trail)) {
2910                            ++source;
2911                            ++nextSourceIndex;
2912                            c=UTF16_GET_PAIR_VALUE(c, trail);
2913                            if(!hasSupplementary) {
2914                                /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2915                                /* callback(unassigned) */
2916                                goto unassigned;
2917                            }
2918                            /* convert this supplementary code point */
2919                            /* exit this condition tree */
2920                        } else {
2921                            /* this is an unmatched lead code unit (1st surrogate) */
2922                            /* callback(illegal) */
2923                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2924                            break;
2925                        }
2926                    } else {
2927                        /* no more input */
2928                        break;
2929                    }
2930                } else {
2931                    /* this is an unmatched trail code unit (2nd surrogate) */
2932                    /* callback(illegal) */
2933                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2934                    break;
2935                }
2936            }
2937
2938            /* convert the Unicode code point in c into codepage bytes */
2939            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
2940
2941            /* is this code point assigned, or do we use fallbacks? */
2942            if(value>=minValue) {
2943                /* assigned, write the output character bytes from value and length */
2944                /* length==1 */
2945                /* this is easy because we know that there is enough space */
2946                *target++=(uint8_t)value;
2947                if(offsets!=NULL) {
2948                    *offsets++=sourceIndex;
2949                }
2950                --targetCapacity;
2951
2952                /* normal end of conversion: prepare for a new character */
2953                c=0;
2954                sourceIndex=nextSourceIndex;
2955            } else { /* unassigned */
2956unassigned:
2957                /* try an extension mapping */
2958                pArgs->source=source;
2959                c=_extFromU(cnv, cnv->sharedData,
2960                            c, &source, sourceLimit,
2961                            &target, target+targetCapacity,
2962                            &offsets, sourceIndex,
2963                            pArgs->flush,
2964                            pErrorCode);
2965                nextSourceIndex+=(int32_t)(source-pArgs->source);
2966
2967                if(U_FAILURE(*pErrorCode)) {
2968                    /* not mappable or buffer overflow */
2969                    break;
2970                } else {
2971                    /* a mapping was written to the target, continue */
2972
2973                    /* recalculate the targetCapacity after an extension mapping */
2974                    targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
2975
2976                    /* normal end of conversion: prepare for a new character */
2977                    sourceIndex=nextSourceIndex;
2978                }
2979            }
2980        } else {
2981            /* target is full */
2982            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2983            break;
2984        }
2985    }
2986
2987    /* set the converter state back into UConverter */
2988    cnv->fromUChar32=c;
2989
2990    /* write back the updated pointers */
2991    pArgs->source=source;
2992    pArgs->target=(char *)target;
2993    pArgs->offsets=offsets;
2994}
2995
2996/*
2997 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
2998 * that map only to and from the BMP.
2999 * In addition to single-byte/state optimizations, the offset calculations
3000 * become much easier.
3001 * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3002 * but measurements have shown that this diminishes performance
3003 * in more cases than it improves it.
3004 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3005 * for various MBCS and SBCS optimizations.
3006 */
3007static void
3008ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
3009                              UErrorCode *pErrorCode) {
3010    UConverter *cnv;
3011    const UChar *source, *sourceLimit, *lastSource;
3012    uint8_t *target;
3013    int32_t targetCapacity, length;
3014    int32_t *offsets;
3015
3016    const uint16_t *table;
3017    const uint16_t *results;
3018
3019    UChar32 c;
3020
3021    int32_t sourceIndex;
3022
3023    uint32_t asciiRoundtrips;
3024    uint16_t value, minValue;
3025
3026    /* set up the local pointers */
3027    cnv=pArgs->converter;
3028    source=pArgs->source;
3029    sourceLimit=pArgs->sourceLimit;
3030    target=(uint8_t *)pArgs->target;
3031    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3032    offsets=pArgs->offsets;
3033
3034    table=cnv->sharedData->mbcs.fromUnicodeTable;
3035    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3036        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3037    } else {
3038        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
3039    }
3040    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3041
3042    if(cnv->useFallback) {
3043        /* use all roundtrip and fallback results */
3044        minValue=0x800;
3045    } else {
3046        /* use only roundtrips and fallbacks from private-use characters */
3047        minValue=0xc00;
3048    }
3049
3050    /* get the converter state from UConverter */
3051    c=cnv->fromUChar32;
3052
3053    /* sourceIndex=-1 if the current character began in the previous buffer */
3054    sourceIndex= c==0 ? 0 : -1;
3055    lastSource=source;
3056
3057    /*
3058     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3059     * for the minimum of the sourceLength and targetCapacity
3060     */
3061    length=(int32_t)(sourceLimit-source);
3062    if(length<targetCapacity) {
3063        targetCapacity=length;
3064    }
3065
3066    /* conversion loop */
3067    if(c!=0 && targetCapacity>0) {
3068        goto getTrail;
3069    }
3070
3071#if MBCS_UNROLL_SINGLE_FROM_BMP
3072    /* unrolling makes it slower on Pentium III/Windows 2000?! */
3073    /* unroll the loop with the most common case */
3074unrolled:
3075    if(targetCapacity>=4) {
3076        int32_t count, loops;
3077        uint16_t andedValues;
3078
3079        loops=count=targetCapacity>>2;
3080        do {
3081            c=*source++;
3082            andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3083            *target++=(uint8_t)value;
3084            c=*source++;
3085            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3086            *target++=(uint8_t)value;
3087            c=*source++;
3088            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3089            *target++=(uint8_t)value;
3090            c=*source++;
3091            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3092            *target++=(uint8_t)value;
3093
3094            /* were all 4 entries really valid? */
3095            if(andedValues<minValue) {
3096                /* no, return to the first of these 4 */
3097                source-=4;
3098                target-=4;
3099                break;
3100            }
3101        } while(--count>0);
3102        count=loops-count;
3103        targetCapacity-=4*count;
3104
3105        if(offsets!=NULL) {
3106            lastSource+=4*count;
3107            while(count>0) {
3108                *offsets++=sourceIndex++;
3109                *offsets++=sourceIndex++;
3110                *offsets++=sourceIndex++;
3111                *offsets++=sourceIndex++;
3112                --count;
3113            }
3114        }
3115
3116        c=0;
3117    }
3118#endif
3119
3120    while(targetCapacity>0) {
3121        /*
3122         * Get a correct Unicode code point:
3123         * a single UChar for a BMP code point or
3124         * a matched surrogate pair for a "supplementary code point".
3125         */
3126        c=*source++;
3127        /*
3128         * Do not immediately check for single surrogates:
3129         * Assume that they are unassigned and check for them in that case.
3130         * This speeds up the conversion of assigned characters.
3131         */
3132        /* convert the Unicode code point in c into codepage bytes */
3133        if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3134            *target++=(uint8_t)c;
3135            --targetCapacity;
3136            c=0;
3137            continue;
3138        }
3139        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3140        /* is this code point assigned, or do we use fallbacks? */
3141        if(value>=minValue) {
3142            /* assigned, write the output character bytes from value and length */
3143            /* length==1 */
3144            /* this is easy because we know that there is enough space */
3145            *target++=(uint8_t)value;
3146            --targetCapacity;
3147
3148            /* normal end of conversion: prepare for a new character */
3149            c=0;
3150            continue;
3151        } else if(!UTF_IS_SURROGATE(c)) {
3152            /* normal, unassigned BMP character */
3153        } else if(UTF_IS_SURROGATE_FIRST(c)) {
3154getTrail:
3155            if(source<sourceLimit) {
3156                /* test the following code unit */
3157                UChar trail=*source;
3158                if(UTF_IS_SECOND_SURROGATE(trail)) {
3159                    ++source;
3160                    c=UTF16_GET_PAIR_VALUE(c, trail);
3161                    /* this codepage does not map supplementary code points */
3162                    /* callback(unassigned) */
3163                } else {
3164                    /* this is an unmatched lead code unit (1st surrogate) */
3165                    /* callback(illegal) */
3166                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3167                    break;
3168                }
3169            } else {
3170                /* no more input */
3171                if (pArgs->flush) {
3172                    *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3173                }
3174                break;
3175            }
3176        } else {
3177            /* this is an unmatched trail code unit (2nd surrogate) */
3178            /* callback(illegal) */
3179            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3180            break;
3181        }
3182
3183        /* c does not have a mapping */
3184
3185        /* get the number of code units for c to correctly advance sourceIndex */
3186        length=U16_LENGTH(c);
3187
3188        /* set offsets since the start or the last extension */
3189        if(offsets!=NULL) {
3190            int32_t count=(int32_t)(source-lastSource);
3191
3192            /* do not set the offset for this character */
3193            count-=length;
3194
3195            while(count>0) {
3196                *offsets++=sourceIndex++;
3197                --count;
3198            }
3199            /* offsets and sourceIndex are now set for the current character */
3200        }
3201
3202        /* try an extension mapping */
3203        lastSource=source;
3204        c=_extFromU(cnv, cnv->sharedData,
3205                    c, &source, sourceLimit,
3206                    &target, target+targetCapacity,
3207                    &offsets, sourceIndex,
3208                    pArgs->flush,
3209                    pErrorCode);
3210        sourceIndex+=length+(int32_t)(source-lastSource);
3211        lastSource=source;
3212
3213        if(U_FAILURE(*pErrorCode)) {
3214            /* not mappable or buffer overflow */
3215            break;
3216        } else {
3217            /* a mapping was written to the target, continue */
3218
3219            /* recalculate the targetCapacity after an extension mapping */
3220            targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3221            length=(int32_t)(sourceLimit-source);
3222            if(length<targetCapacity) {
3223                targetCapacity=length;
3224            }
3225        }
3226
3227#if MBCS_UNROLL_SINGLE_FROM_BMP
3228        /* unrolling makes it slower on Pentium III/Windows 2000?! */
3229        goto unrolled;
3230#endif
3231    }
3232
3233    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
3234        /* target is full */
3235        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3236    }
3237
3238    /* set offsets since the start or the last callback */
3239    if(offsets!=NULL) {
3240        size_t count=source-lastSource;
3241        if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
3242            /*
3243            Caller gave us a partial supplementary character,
3244            which this function couldn't convert in any case.
3245            The callback will handle the offset.
3246            */
3247            count--;
3248        }
3249        while(count>0) {
3250            *offsets++=sourceIndex++;
3251            --count;
3252        }
3253    }
3254
3255    /* set the converter state back into UConverter */
3256    cnv->fromUChar32=c;
3257
3258    /* write back the updated pointers */
3259    pArgs->source=source;
3260    pArgs->target=(char *)target;
3261    pArgs->offsets=offsets;
3262}
3263
3264U_CFUNC void
3265ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3266                            UErrorCode *pErrorCode) {
3267    UConverter *cnv;
3268    const UChar *source, *sourceLimit;
3269    uint8_t *target;
3270    int32_t targetCapacity;
3271    int32_t *offsets;
3272
3273    const uint16_t *table;
3274    const uint16_t *mbcsIndex;
3275    const uint8_t *p, *bytes;
3276    uint8_t outputType;
3277
3278    UChar32 c;
3279
3280    int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
3281
3282    uint32_t stage2Entry;
3283    uint32_t asciiRoundtrips;
3284    uint32_t value;
3285    int32_t length, prevLength;
3286    uint8_t unicodeMask;
3287
3288    cnv=pArgs->converter;
3289
3290    if(cnv->preFromUFirstCP>=0) {
3291        /*
3292         * pass sourceIndex=-1 because we continue from an earlier buffer
3293         * in the future, this may change with continuous offsets
3294         */
3295        ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
3296
3297        if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
3298            return;
3299        }
3300    }
3301
3302    /* use optimized function if possible */
3303    outputType=cnv->sharedData->mbcs.outputType;
3304    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3305    if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3306        if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3307            ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
3308        } else {
3309            ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
3310        }
3311        return;
3312    } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
3313        ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
3314        return;
3315    }
3316
3317    /* set up the local pointers */
3318    source=pArgs->source;
3319    sourceLimit=pArgs->sourceLimit;
3320    target=(uint8_t *)pArgs->target;
3321    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3322    offsets=pArgs->offsets;
3323
3324    table=cnv->sharedData->mbcs.fromUnicodeTable;
3325    if(cnv->sharedData->mbcs.utf8Friendly) {
3326        mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
3327    } else {
3328        mbcsIndex=NULL;
3329    }
3330    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3331        bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3332    } else {
3333        bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3334    }
3335    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3336
3337    /* get the converter state from UConverter */
3338    c=cnv->fromUChar32;
3339
3340    if(outputType==MBCS_OUTPUT_2_SISO) {
3341        prevLength=cnv->fromUnicodeStatus;
3342        if(prevLength==0) {
3343            /* set the real value */
3344            prevLength=1;
3345        }
3346    } else {
3347        /* prevent fromUnicodeStatus from being set to something non-0 */
3348        prevLength=0;
3349    }
3350
3351    /* sourceIndex=-1 if the current character began in the previous buffer */
3352    prevSourceIndex=-1;
3353    sourceIndex= c==0 ? 0 : -1;
3354    nextSourceIndex=0;
3355
3356    /* conversion loop */
3357    /*
3358     * This is another piece of ugly code:
3359     * A goto into the loop if the converter state contains a first surrogate
3360     * from the previous function call.
3361     * It saves me from checking if(c==0) in each loop iteration
3362     * and from duplicating the trail-surrogate-handling code in the else
3363     * branch of that check.
3364     * I could not find any other way to get around this other than
3365     * using a function call for the conversion and callback, which would
3366     * be even more inefficient.
3367     *
3368     * Markus Scherer 2000-jul-19
3369     */
3370    if(c!=0 && targetCapacity>0) {
3371        goto getTrail;
3372    }
3373
3374    while(source<sourceLimit) {
3375        /*
3376         * This following test is to see if available input would overflow the output.
3377         * The following test checks whether the available input would overflow the output.
3378         * overflows as a result of a multi-byte character or callback output
3379         * from the last source character.
3380         * Therefore, those situations also test for overflows and will
3381         * then break the loop, too.
3382         */
3383        if(targetCapacity>0) {
3384            /*
3385             * Get a correct Unicode code point:
3386             * a single UChar for a BMP code point or
3387             * a matched surrogate pair for a "supplementary code point".
3388             */
3389            c=*source++;
3390            ++nextSourceIndex;
3391            if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3392                *target++=(uint8_t)c;
3393                if(offsets!=NULL) {
3394                    *offsets++=sourceIndex;
3395                    prevSourceIndex=sourceIndex;
3396                    sourceIndex=nextSourceIndex;
3397                }
3398                --targetCapacity;
3399                c=0;
3400                continue;
3401            }
3402            /*
3403             * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3404             * to avoid dealing with surrogates.
3405             * MBCS_FAST_MAX must be >=0xd7ff.
3406             */
3407            if(c<=0xd7ff && mbcsIndex!=NULL) {
3408                value=mbcsIndex[c>>6];
3409
3410                /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
3411                /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3412                switch(outputType) {
3413                case MBCS_OUTPUT_2:
3414                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
3415                    if(value<=0xff) {
3416                        if(value==0) {
3417                            goto unassigned;
3418                        } else {
3419                            length=1;
3420                        }
3421                    } else {
3422                        length=2;
3423                    }
3424                    break;
3425                case MBCS_OUTPUT_2_SISO:
3426                    /* 1/2-byte stateful with Shift-In/Shift-Out */
3427                    /*
3428                     * Save the old state in the converter object
3429                     * right here, then change the local prevLength state variable if necessary.
3430                     * Then, if this character turns out to be unassigned or a fallback that
3431                     * is not taken, the callback code must not save the new state in the converter
3432                     * because the new state is for a character that is not output.
3433                     * However, the callback must still restore the state from the converter
3434                     * in case the callback function changed it for its output.
3435                     */
3436                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
3437                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
3438                    if(value<=0xff) {
3439                        if(value==0) {
3440                            goto unassigned;
3441                        } else if(prevLength<=1) {
3442                            length=1;
3443                        } else {
3444                            /* change from double-byte mode to single-byte */
3445                            value|=(uint32_t)UCNV_SI<<8;
3446                            length=2;
3447                            prevLength=1;
3448                        }
3449                    } else {
3450                        if(prevLength==2) {
3451                            length=2;
3452                        } else {
3453                            /* change from single-byte mode to double-byte */
3454                            value|=(uint32_t)UCNV_SO<<16;
3455                            length=3;
3456                            prevLength=2;
3457                        }
3458                    }
3459                    break;
3460                case MBCS_OUTPUT_DBCS_ONLY:
3461                    /* table with single-byte results, but only DBCS mappings used */
3462                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
3463                    if(value<=0xff) {
3464                        /* no mapping or SBCS result, not taken for DBCS-only */
3465                        goto unassigned;
3466                    } else {
3467                        length=2;
3468                    }
3469                    break;
3470                case MBCS_OUTPUT_3:
3471                    p=bytes+(value+(c&0x3f))*3;
3472                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3473                    if(value<=0xff) {
3474                        if(value==0) {
3475                            goto unassigned;
3476                        } else {
3477                            length=1;
3478                        }
3479                    } else if(value<=0xffff) {
3480                        length=2;
3481                    } else {
3482                        length=3;
3483                    }
3484                    break;
3485                case MBCS_OUTPUT_4:
3486                    value=((const uint32_t *)bytes)[value +(c&0x3f)];
3487                    if(value<=0xff) {
3488                        if(value==0) {
3489                            goto unassigned;
3490                        } else {
3491                            length=1;
3492                        }
3493                    } else if(value<=0xffff) {
3494                        length=2;
3495                    } else if(value<=0xffffff) {
3496                        length=3;
3497                    } else {
3498                        length=4;
3499                    }
3500                    break;
3501                case MBCS_OUTPUT_3_EUC:
3502                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
3503                    /* EUC 16-bit fixed-length representation */
3504                    if(value<=0xff) {
3505                        if(value==0) {
3506                            goto unassigned;
3507                        } else {
3508                            length=1;
3509                        }
3510                    } else if((value&0x8000)==0) {
3511                        value|=0x8e8000;
3512                        length=3;
3513                    } else if((value&0x80)==0) {
3514                        value|=0x8f0080;
3515                        length=3;
3516                    } else {
3517                        length=2;
3518                    }
3519                    break;
3520                case MBCS_OUTPUT_4_EUC:
3521                    p=bytes+(value+(c&0x3f))*3;
3522                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3523                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
3524                    if(value<=0xff) {
3525                        if(value==0) {
3526                            goto unassigned;
3527                        } else {
3528                            length=1;
3529                        }
3530                    } else if(value<=0xffff) {
3531                        length=2;
3532                    } else if((value&0x800000)==0) {
3533                        value|=0x8e800000;
3534                        length=4;
3535                    } else if((value&0x8000)==0) {
3536                        value|=0x8f008000;
3537                        length=4;
3538                    } else {
3539                        length=3;
3540                    }
3541                    break;
3542                default:
3543                    /* must not occur */
3544                    /*
3545                     * To avoid compiler warnings that value & length may be
3546                     * used without having been initialized, we set them here.
3547                     * In reality, this is unreachable code.
3548                     * Not having a default branch also causes warnings with
3549                     * some compilers.
3550                     */
3551                    value=0;
3552                    length=0;
3553                    break;
3554                }
3555                /* output the value */
3556            } else {
3557                /*
3558                 * This also tests if the codepage maps single surrogates.
3559                 * If it does, then surrogates are not paired but mapped separately.
3560                 * Note that in this case unmatched surrogates are not detected.
3561                 */
3562                if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3563                    if(UTF_IS_SURROGATE_FIRST(c)) {
3564getTrail:
3565                        if(source<sourceLimit) {
3566                            /* test the following code unit */
3567                            UChar trail=*source;
3568                            if(UTF_IS_SECOND_SURROGATE(trail)) {
3569                                ++source;
3570                                ++nextSourceIndex;
3571                                c=UTF16_GET_PAIR_VALUE(c, trail);
3572                                if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3573                                    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3574                                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
3575                                    /* callback(unassigned) */
3576                                    goto unassigned;
3577                                }
3578                                /* convert this supplementary code point */
3579                                /* exit this condition tree */
3580                            } else {
3581                                /* this is an unmatched lead code unit (1st surrogate) */
3582                                /* callback(illegal) */
3583                                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3584                                break;
3585                            }
3586                        } else {
3587                            /* no more input */
3588                            break;
3589                        }
3590                    } else {
3591                        /* this is an unmatched trail code unit (2nd surrogate) */
3592                        /* callback(illegal) */
3593                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3594                        break;
3595                    }
3596                }
3597
3598                /* convert the Unicode code point in c into codepage bytes */
3599
3600                /*
3601                 * The basic lookup is a triple-stage compact array (trie) lookup.
3602                 * For details see the beginning of this file.
3603                 *
3604                 * Single-byte codepages are handled with a different data structure
3605                 * by _MBCSSingle... functions.
3606                 *
3607                 * The result consists of a 32-bit value from stage 2 and
3608                 * a pointer to as many bytes as are stored per character.
3609                 * The pointer points to the character's bytes in stage 3.
3610                 * Bits 15..0 of the stage 2 entry contain the stage 3 index
3611                 * for that pointer, while bits 31..16 are flags for which of
3612                 * the 16 characters in the block are roundtrip-assigned.
3613                 *
3614                 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
3615                 * respectively as uint32_t, in the platform encoding.
3616                 * For 3-byte codepages, the bytes are always stored in big-endian order.
3617                 *
3618                 * For EUC encodings that use only either 0x8e or 0x8f as the first
3619                 * byte of their longest byte sequences, the first two bytes in
3620                 * this third stage indicate with their 7th bits whether these bytes
3621                 * are to be written directly or actually need to be preceded by
3622                 * one of the two Single-Shift codes. With this, the third stage
3623                 * stores one byte fewer per character than the actual maximum length of
3624                 * EUC byte sequences.
3625                 *
3626                 * Other than that, leading zero bytes are removed and the other
3627                 * bytes output. A single zero byte may be output if the "assigned"
3628                 * bit in stage 2 was on.
3629                 * The data structure does not support zero byte output as a fallback,
3630                 * and also does not allow output of leading zeros.
3631                 */
3632                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3633
3634                /* get the bytes and the length for the output */
3635                switch(outputType) {
3636                case MBCS_OUTPUT_2:
3637                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3638                    if(value<=0xff) {
3639                        length=1;
3640                    } else {
3641                        length=2;
3642                    }
3643                    break;
3644                case MBCS_OUTPUT_2_SISO:
3645                    /* 1/2-byte stateful with Shift-In/Shift-Out */
3646                    /*
3647                     * Save the old state in the converter object
3648                     * right here, then change the local prevLength state variable if necessary.
3649                     * Then, if this character turns out to be unassigned or a fallback that
3650                     * is not taken, the callback code must not save the new state in the converter
3651                     * because the new state is for a character that is not output.
3652                     * However, the callback must still restore the state from the converter
3653                     * in case the callback function changed it for its output.
3654                     */
3655                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
3656                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3657                    if(value<=0xff) {
3658                        if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
3659                            /* no mapping, leave value==0 */
3660                            length=0;
3661                        } else if(prevLength<=1) {
3662                            length=1;
3663                        } else {
3664                            /* change from double-byte mode to single-byte */
3665                            value|=(uint32_t)UCNV_SI<<8;
3666                            length=2;
3667                            prevLength=1;
3668                        }
3669                    } else {
3670                        if(prevLength==2) {
3671                            length=2;
3672                        } else {
3673                            /* change from single-byte mode to double-byte */
3674                            value|=(uint32_t)UCNV_SO<<16;
3675                            length=3;
3676                            prevLength=2;
3677                        }
3678                    }
3679                    break;
3680                case MBCS_OUTPUT_DBCS_ONLY:
3681                    /* table with single-byte results, but only DBCS mappings used */
3682                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3683                    if(value<=0xff) {
3684                        /* no mapping or SBCS result, not taken for DBCS-only */
3685                        value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
3686                        length=0;
3687                    } else {
3688                        length=2;
3689                    }
3690                    break;
3691                case MBCS_OUTPUT_3:
3692                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
3693                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3694                    if(value<=0xff) {
3695                        length=1;
3696                    } else if(value<=0xffff) {
3697                        length=2;
3698                    } else {
3699                        length=3;
3700                    }
3701                    break;
3702                case MBCS_OUTPUT_4:
3703                    value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
3704                    if(value<=0xff) {
3705                        length=1;
3706                    } else if(value<=0xffff) {
3707                        length=2;
3708                    } else if(value<=0xffffff) {
3709                        length=3;
3710                    } else {
3711                        length=4;
3712                    }
3713                    break;
3714                case MBCS_OUTPUT_3_EUC:
3715                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3716                    /* EUC 16-bit fixed-length representation */
3717                    if(value<=0xff) {
3718                        length=1;
3719                    } else if((value&0x8000)==0) {
3720                        value|=0x8e8000;
3721                        length=3;
3722                    } else if((value&0x80)==0) {
3723                        value|=0x8f0080;
3724                        length=3;
3725                    } else {
3726                        length=2;
3727                    }
3728                    break;
3729                case MBCS_OUTPUT_4_EUC:
3730                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
3731                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3732                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
3733                    if(value<=0xff) {
3734                        length=1;
3735                    } else if(value<=0xffff) {
3736                        length=2;
3737                    } else if((value&0x800000)==0) {
3738                        value|=0x8e800000;
3739                        length=4;
3740                    } else if((value&0x8000)==0) {
3741                        value|=0x8f008000;
3742                        length=4;
3743                    } else {
3744                        length=3;
3745                    }
3746                    break;
3747                default:
3748                    /* must not occur */
3749                    /*
3750                     * To avoid compiler warnings that value & length may be
3751                     * used without having been initialized, we set them here.
3752                     * In reality, this is unreachable code.
3753                     * Not having a default branch also causes warnings with
3754                     * some compilers.
3755                     */
3756                    value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
3757                    length=0;
3758                    break;
3759                }
3760
3761                /* is this code point assigned, or do we use fallbacks? */
3762                if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
3763                     (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
3764                ) {
3765                    /*
3766                     * We allow a 0 byte output if the "assigned" bit is set for this entry.
3767                     * There is no way with this data structure for fallback output
3768                     * to be a zero byte.
3769                     */
3770
3771unassigned:
3772                    /* try an extension mapping */
3773                    pArgs->source=source;
3774                    c=_extFromU(cnv, cnv->sharedData,
3775                                c, &source, sourceLimit,
3776                                &target, target+targetCapacity,
3777                                &offsets, sourceIndex,
3778                                pArgs->flush,
3779                                pErrorCode);
3780                    nextSourceIndex+=(int32_t)(source-pArgs->source);
3781                    prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
3782
3783                    if(U_FAILURE(*pErrorCode)) {
3784                        /* not mappable or buffer overflow */
3785                        break;
3786                    } else {
3787                        /* a mapping was written to the target, continue */
3788
3789                        /* recalculate the targetCapacity after an extension mapping */
3790                        targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3791
3792                        /* normal end of conversion: prepare for a new character */
3793                        if(offsets!=NULL) {
3794                            prevSourceIndex=sourceIndex;
3795                            sourceIndex=nextSourceIndex;
3796                        }
3797                        continue;
3798                    }
3799                }
3800            }
3801
3802            /* write the output character bytes from value and length */
3803            /* from the first if in the loop we know that targetCapacity>0 */
3804            if(length<=targetCapacity) {
3805                if(offsets==NULL) {
3806                    switch(length) {
3807                        /* each branch falls through to the next one */
3808                    case 4:
3809                        *target++=(uint8_t)(value>>24);
3810                    case 3:
3811                        *target++=(uint8_t)(value>>16);
3812                    case 2:
3813                        *target++=(uint8_t)(value>>8);
3814                    case 1:
3815                        *target++=(uint8_t)value;
3816                    default:
3817                        /* will never occur */
3818                        break;
3819                    }
3820                } else {
3821                    switch(length) {
3822                        /* each branch falls through to the next one */
3823                    case 4:
3824                        *target++=(uint8_t)(value>>24);
3825                        *offsets++=sourceIndex;
3826                    case 3:
3827                        *target++=(uint8_t)(value>>16);
3828                        *offsets++=sourceIndex;
3829                    case 2:
3830                        *target++=(uint8_t)(value>>8);
3831                        *offsets++=sourceIndex;
3832                    case 1:
3833                        *target++=(uint8_t)value;
3834                        *offsets++=sourceIndex;
3835                    default:
3836                        /* will never occur */
3837                        break;
3838                    }
3839                }
3840                targetCapacity-=length;
3841            } else {
3842                uint8_t *charErrorBuffer;
3843
3844                /*
3845                 * We actually do this backwards here:
3846                 * In order to save an intermediate variable, we output
3847                 * first to the overflow buffer what does not fit into the
3848                 * regular target.
3849                 */
3850                /* we know that 1<=targetCapacity<length<=4 */
3851                length-=targetCapacity;
3852                charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
3853                switch(length) {
3854                    /* each branch falls through to the next one */
3855                case 3:
3856                    *charErrorBuffer++=(uint8_t)(value>>16);
3857                case 2:
3858                    *charErrorBuffer++=(uint8_t)(value>>8);
3859                case 1:
3860                    *charErrorBuffer=(uint8_t)value;
3861                default:
3862                    /* will never occur */
3863                    break;
3864                }
3865                cnv->charErrorBufferLength=(int8_t)length;
3866
3867                /* now output what fits into the regular target */
3868                value>>=8*length; /* length was reduced by targetCapacity */
3869                switch(targetCapacity) {
3870                    /* each branch falls through to the next one */
3871                case 3:
3872                    *target++=(uint8_t)(value>>16);
3873                    if(offsets!=NULL) {
3874                        *offsets++=sourceIndex;
3875                    }
3876                case 2:
3877                    *target++=(uint8_t)(value>>8);
3878                    if(offsets!=NULL) {
3879                        *offsets++=sourceIndex;
3880                    }
3881                case 1:
3882                    *target++=(uint8_t)value;
3883                    if(offsets!=NULL) {
3884                        *offsets++=sourceIndex;
3885                    }
3886                default:
3887                    /* will never occur */
3888                    break;
3889                }
3890
3891                /* target overflow */
3892                targetCapacity=0;
3893                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3894                c=0;
3895                break;
3896            }
3897
3898            /* normal end of conversion: prepare for a new character */
3899            c=0;
3900            if(offsets!=NULL) {
3901                prevSourceIndex=sourceIndex;
3902                sourceIndex=nextSourceIndex;
3903            }
3904            continue;
3905        } else {
3906            /* target is full */
3907            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3908            break;
3909        }
3910    }
3911
3912    /*
3913     * the end of the input stream and detection of truncated input
3914     * are handled by the framework, but for EBCDIC_STATEFUL conversion
3915     * we need to emit an SI at the very end
3916     *
3917     * conditions:
3918     *   successful
3919     *   EBCDIC_STATEFUL in DBCS mode
3920     *   end of input and no truncated input
3921     */
3922    if( U_SUCCESS(*pErrorCode) &&
3923        outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
3924        pArgs->flush && source>=sourceLimit && c==0
3925    ) {
3926        /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
3927        if(targetCapacity>0) {
3928            *target++=(uint8_t)UCNV_SI;
3929            if(offsets!=NULL) {
3930                /* set the last source character's index (sourceIndex points at sourceLimit now) */
3931                *offsets++=prevSourceIndex;
3932            }
3933        } else {
3934            /* target is full */
3935            cnv->charErrorBuffer[0]=(char)UCNV_SI;
3936            cnv->charErrorBufferLength=1;
3937            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3938        }
3939        prevLength=1; /* we switched into SBCS */
3940    }
3941
3942    /* set the converter state back into UConverter */
3943    cnv->fromUChar32=c;
3944    cnv->fromUnicodeStatus=prevLength;
3945
3946    /* write back the updated pointers */
3947    pArgs->source=source;
3948    pArgs->target=(char *)target;
3949    pArgs->offsets=offsets;
3950}
3951
3952/*
3953 * This is another simple conversion function for internal use by other
3954 * conversion implementations.
3955 * It does not use the converter state nor call callbacks.
3956 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3957 * It handles conversion extensions but not GB 18030.
3958 *
3959 * It converts one single Unicode code point into codepage bytes, encoded
3960 * as one 32-bit value. The function returns the number of bytes in *pValue:
3961 * 1..4 the number of bytes in *pValue
3962 * 0    unassigned (*pValue undefined)
3963 * -1   illegal (currently not used, *pValue undefined)
3964 *
3965 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
3966 * the second to last byte in bits 15..8, etc.
3967 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
3968 */
3969U_CFUNC int32_t
3970ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
3971                 UChar32 c, uint32_t *pValue,
3972                 UBool useFallback) {
3973    const int32_t *cx;
3974    const uint16_t *table;
3975#if 0
3976/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
3977    const uint8_t *p;
3978#endif
3979    uint32_t stage2Entry;
3980    uint32_t value;
3981    int32_t length;
3982
3983    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3984    if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3985        table=sharedData->mbcs.fromUnicodeTable;
3986
3987        /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
3988        if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
3989            value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
3990            /* is this code point assigned, or do we use fallbacks? */
3991            if(useFallback ? value>=0x800 : value>=0xc00) {
3992                *pValue=value&0xff;
3993                return 1;
3994            }
3995        } else /* outputType!=MBCS_OUTPUT_1 */ {
3996            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3997
3998            /* get the bytes and the length for the output */
3999            switch(sharedData->mbcs.outputType) {
4000            case MBCS_OUTPUT_2:
4001                value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4002                if(value<=0xff) {
4003                    length=1;
4004                } else {
4005                    length=2;
4006                }
4007                break;
4008#if 0
4009/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4010            case MBCS_OUTPUT_DBCS_ONLY:
4011                /* table with single-byte results, but only DBCS mappings used */
4012                value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4013                if(value<=0xff) {
4014                    /* no mapping or SBCS result, not taken for DBCS-only */
4015                    value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4016                    length=0;
4017                } else {
4018                    length=2;
4019                }
4020                break;
4021            case MBCS_OUTPUT_3:
4022                p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4023                value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4024                if(value<=0xff) {
4025                    length=1;
4026                } else if(value<=0xffff) {
4027                    length=2;
4028                } else {
4029                    length=3;
4030                }
4031                break;
4032            case MBCS_OUTPUT_4:
4033                value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4034                if(value<=0xff) {
4035                    length=1;
4036                } else if(value<=0xffff) {
4037                    length=2;
4038                } else if(value<=0xffffff) {
4039                    length=3;
4040                } else {
4041                    length=4;
4042                }
4043                break;
4044            case MBCS_OUTPUT_3_EUC:
4045                value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4046                /* EUC 16-bit fixed-length representation */
4047                if(value<=0xff) {
4048                    length=1;
4049                } else if((value&0x8000)==0) {
4050                    value|=0x8e8000;
4051                    length=3;
4052                } else if((value&0x80)==0) {
4053                    value|=0x8f0080;
4054                    length=3;
4055                } else {
4056                    length=2;
4057                }
4058                break;
4059            case MBCS_OUTPUT_4_EUC:
4060                p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4061                value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4062                /* EUC 16-bit fixed-length representation applied to the first two bytes */
4063                if(value<=0xff) {
4064                    length=1;
4065                } else if(value<=0xffff) {
4066                    length=2;
4067                } else if((value&0x800000)==0) {
4068                    value|=0x8e800000;
4069                    length=4;
4070                } else if((value&0x8000)==0) {
4071                    value|=0x8f008000;
4072                    length=4;
4073                } else {
4074                    length=3;
4075                }
4076                break;
4077#endif
4078            default:
4079                /* must not occur */
4080                return -1;
4081            }
4082
4083            /* is this code point assigned, or do we use fallbacks? */
4084            if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
4085                (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
4086            ) {
4087                /*
4088                 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4089                 * There is no way with this data structure for fallback output
4090                 * to be a zero byte.
4091                 */
4092                /* assigned */
4093                *pValue=value;
4094                return length;
4095            }
4096        }
4097    }
4098
4099    cx=sharedData->mbcs.extIndexes;
4100    if(cx!=NULL) {
4101        length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
4102        return length>=0 ? length : -length;  /* return abs(length); */
4103    }
4104
4105    /* unassigned */
4106    return 0;
4107}
4108
4109
#if 0
/*
 * The function below is compiled out: it has been moved to ucnv2022.c
 * for inlining. The copy here is kept for documentation purposes only.
 */

/**
 * Variant of ucnv_MBCSFromUChar32() that is optimized for single-byte codepages.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It does not handle conversion extensions (_extFromU()).
 *
 * @param sharedData  shared data of the single-byte converter
 * @param c           the code point to convert
 * @param useFallback whether fallback results are acceptable
 * @return the codepage byte for the code point, or -1 if it is unassigned
 */
U_CFUNC int32_t
ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                       UChar32 c,
                       UBool useFallback) {
    int32_t result;
    int32_t threshold;

    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c>0xffff && (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)==0) {
        return -1;
    }

    /* look up the result word for c (same lookup as in _MBCSFromUnicodeWithOffsets) */
    result=MBCS_SINGLE_RESULT_FROM_U(sharedData->mbcs.fromUnicodeTable,
                                     (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);

    /* the lowest result word that counts as a usable mapping depends on useFallback */
    threshold= useFallback ? 0x800 : 0xc00;

    /* the output byte is in the low 8 bits of the result word */
    return result>=threshold ? (result&0xff) : -1;
}
#endif
4148
4149/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
4150
4151/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
4152static const UChar32
4153utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
4154
4155/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
4156static const UChar32
4157utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
4158
4159static void
4160ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
4161                  UConverterToUnicodeArgs *pToUArgs,
4162                  UErrorCode *pErrorCode) {
4163    UConverter *utf8, *cnv;
4164    const uint8_t *source, *sourceLimit;
4165    uint8_t *target;
4166    int32_t targetCapacity;
4167
4168    const uint16_t *table, *sbcsIndex;
4169    const uint16_t *results;
4170
4171    int8_t oldToULength, toULength, toULimit;
4172
4173    UChar32 c;
4174    uint8_t b, t1, t2;
4175
4176    uint32_t asciiRoundtrips;
4177    uint16_t value, minValue;
4178    UBool hasSupplementary;
4179
4180    /* set up the local pointers */
4181    utf8=pToUArgs->converter;
4182    cnv=pFromUArgs->converter;
4183    source=(uint8_t *)pToUArgs->source;
4184    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
4185    target=(uint8_t *)pFromUArgs->target;
4186    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
4187
4188    table=cnv->sharedData->mbcs.fromUnicodeTable;
4189    sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
4190    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
4191        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
4192    } else {
4193        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
4194    }
4195    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
4196
4197    if(cnv->useFallback) {
4198        /* use all roundtrip and fallback results */
4199        minValue=0x800;
4200    } else {
4201        /* use only roundtrips and fallbacks from private-use characters */
4202        minValue=0xc00;
4203    }
4204    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
4205
4206    /* get the converter state from the UTF-8 UConverter */
4207    c=(UChar32)utf8->toUnicodeStatus;
4208    if(c!=0) {
4209        toULength=oldToULength=utf8->toULength;
4210        toULimit=(int8_t)utf8->mode;
4211    } else {
4212        toULength=oldToULength=toULimit=0;
4213    }
4214
4215    /*
4216     * Make sure that the last byte sequence before sourceLimit is complete
4217     * or runs into a lead byte.
4218     * Do not go back into the bytes that will be read for finishing a partial
4219     * sequence from the previous buffer.
4220     * In the conversion loop compare source with sourceLimit only once
4221     * per multi-byte character.
4222     */
4223    {
4224        int32_t i, length;
4225
4226        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
4227        for(i=0; i<3 && i<length;) {
4228            b=*(sourceLimit-i-1);
4229            if(U8_IS_TRAIL(b)) {
4230                ++i;
4231            } else {
4232                if(i<utf8_countTrailBytes[b]) {
4233                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
4234                    sourceLimit-=i+1;
4235                }
4236                break;
4237            }
4238        }
4239    }
4240
4241    if(c!=0 && targetCapacity>0) {
4242        utf8->toUnicodeStatus=0;
4243        utf8->toULength=0;
4244        goto moreBytes;
4245        /*
4246         * Note: We could avoid the goto by duplicating some of the moreBytes
4247         * code, but only up to the point of collecting a complete UTF-8
4248         * sequence; then recurse for the toUBytes[toULength]
4249         * and then continue with normal conversion.
4250         *
4251         * If so, move this code to just after initializing the minimum
4252         * set of local variables for reading the UTF-8 input
4253         * (utf8, source, target, limits but not cnv, table, minValue, etc.).
4254         *
4255         * Potential advantages:
4256         * - avoid the goto
4257         * - oldToULength could become a local variable in just those code blocks
4258         *   that deal with buffer boundaries
4259         * - possibly faster if the goto prevents some compiler optimizations
4260         *   (this would need measuring to confirm)
4261         * Disadvantage:
4262         * - code duplication
4263         */
4264    }
4265
4266    /* conversion loop */
4267    while(source<sourceLimit) {
4268        if(targetCapacity>0) {
4269            b=*source++;
4270            if((int8_t)b>=0) {
4271                /* convert ASCII */
4272                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
4273                    *target++=(uint8_t)b;
4274                    --targetCapacity;
4275                    continue;
4276                } else {
4277                    c=b;
4278                    value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
4279                }
4280            } else {
4281                if(b<0xe0) {
4282                    if( /* handle U+0080..U+07FF inline */
4283                        b>=0xc2 &&
4284                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
4285                    ) {
4286                        c=b&0x1f;
4287                        ++source;
4288                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
4289                        if(value>=minValue) {
4290                            *target++=(uint8_t)value;
4291                            --targetCapacity;
4292                            continue;
4293                        } else {
4294                            c=(c<<6)|t1;
4295                        }
4296                    } else {
4297                        c=-1;
4298                    }
4299                } else if(b==0xe0) {
4300                    if( /* handle U+0800..U+0FFF inline */
4301                        (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
4302                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
4303                    ) {
4304                        c=t1;
4305                        source+=2;
4306                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
4307                        if(value>=minValue) {
4308                            *target++=(uint8_t)value;
4309                            --targetCapacity;
4310                            continue;
4311                        } else {
4312                            c=(c<<6)|t2;
4313                        }
4314                    } else {
4315                        c=-1;
4316                    }
4317                } else {
4318                    c=-1;
4319                }
4320
4321                if(c<0) {
4322                    /* handle "complicated" and error cases, and continuing partial characters */
4323                    oldToULength=0;
4324                    toULength=1;
4325                    toULimit=utf8_countTrailBytes[b]+1;
4326                    c=b;
4327moreBytes:
4328                    while(toULength<toULimit) {
4329                        if(source<sourceLimit) {
4330                            b=*source;
4331                            if(U8_IS_TRAIL(b)) {
4332                                ++source;
4333                                ++toULength;
4334                                c=(c<<6)+b;
4335                            } else {
4336                                break; /* sequence too short, stop with toULength<toULimit */
4337                            }
4338                        } else {
4339                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
4340                            source-=(toULength-oldToULength);
4341                            while(oldToULength<toULength) {
4342                                utf8->toUBytes[oldToULength++]=*source++;
4343                            }
4344                            utf8->toUnicodeStatus=c;
4345                            utf8->toULength=toULength;
4346                            utf8->mode=toULimit;
4347                            pToUArgs->source=(char *)source;
4348                            pFromUArgs->target=(char *)target;
4349                            return;
4350                        }
4351                    }
4352
4353                    if( toULength==toULimit &&      /* consumed all trail bytes */
4354                        (toULength==3 || toULength==2) &&             /* BMP */
4355                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
4356                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
4357                    ) {
4358                        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4359                    } else if(
4360                        toULength==toULimit && toULength==4 &&
4361                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
4362                    ) {
4363                        /* supplementary code point */
4364                        if(!hasSupplementary) {
4365                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4366                            value=0;
4367                        } else {
4368                            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4369                        }
4370                    } else {
4371                        /* error handling: illegal UTF-8 byte sequence */
4372                        source-=(toULength-oldToULength);
4373                        while(oldToULength<toULength) {
4374                            utf8->toUBytes[oldToULength++]=*source++;
4375                        }
4376                        utf8->toULength=toULength;
4377                        pToUArgs->source=(char *)source;
4378                        pFromUArgs->target=(char *)target;
4379                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4380                        return;
4381                    }
4382                }
4383            }
4384
4385            if(value>=minValue) {
4386                /* output the mapping for c */
4387                *target++=(uint8_t)value;
4388                --targetCapacity;
4389            } else {
4390                /* value<minValue means c is unassigned (unmappable) */
4391                /*
4392                 * Try an extension mapping.
4393                 * Pass in no source because we don't have UTF-16 input.
4394                 * If we have a partial match on c, we will return and revert
4395                 * to UTF-8->UTF-16->charset conversion.
4396                 */
4397                static const UChar nul=0;
4398                const UChar *noSource=&nul;
4399                c=_extFromU(cnv, cnv->sharedData,
4400                            c, &noSource, noSource,
4401                            &target, target+targetCapacity,
4402                            NULL, -1,
4403                            pFromUArgs->flush,
4404                            pErrorCode);
4405
4406                if(U_FAILURE(*pErrorCode)) {
4407                    /* not mappable or buffer overflow */
4408                    cnv->fromUChar32=c;
4409                    break;
4410                } else if(cnv->preFromUFirstCP>=0) {
4411                    /*
4412                     * Partial match, return and revert to pivoting.
4413                     * In normal from-UTF-16 conversion, we would just continue
4414                     * but then exit the loop because the extension match would
4415                     * have consumed the source.
4416                     */
4417                    break;
4418                } else {
4419                    /* a mapping was written to the target, continue */
4420
4421                    /* recalculate the targetCapacity after an extension mapping */
4422                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
4423                }
4424            }
4425        } else {
4426            /* target is full */
4427            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4428            break;
4429        }
4430    }
4431
4432    /*
4433     * The sourceLimit may have been adjusted before the conversion loop
4434     * to stop before a truncated sequence.
4435     * If so, then collect the truncated sequence now.
4436     */
4437    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
4438        c=utf8->toUBytes[0]=b=*source++;
4439        toULength=1;
4440        toULimit=utf8_countTrailBytes[b]+1;
4441        while(source<sourceLimit) {
4442            utf8->toUBytes[toULength++]=b=*source++;
4443            c=(c<<6)+b;
4444        }
4445        utf8->toUnicodeStatus=c;
4446        utf8->toULength=toULength;
4447        utf8->mode=toULimit;
4448    }
4449
4450    /* write back the updated pointers */
4451    pToUArgs->source=(char *)source;
4452    pFromUArgs->target=(char *)target;
4453}
4454
4455static void
4456ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
4457                  UConverterToUnicodeArgs *pToUArgs,
4458                  UErrorCode *pErrorCode) {
4459    UConverter *utf8, *cnv;
4460    const uint8_t *source, *sourceLimit;
4461    uint8_t *target;
4462    int32_t targetCapacity;
4463
4464    const uint16_t *table, *mbcsIndex;
4465    const uint16_t *results;
4466
4467    int8_t oldToULength, toULength, toULimit;
4468
4469    UChar32 c;
4470    uint8_t b, t1, t2;
4471
4472    uint32_t stage2Entry;
4473    uint32_t asciiRoundtrips;
4474    uint16_t value, minValue;
4475    UBool hasSupplementary;
4476
4477    /* set up the local pointers */
4478    utf8=pToUArgs->converter;
4479    cnv=pFromUArgs->converter;
4480    source=(uint8_t *)pToUArgs->source;
4481    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
4482    target=(uint8_t *)pFromUArgs->target;
4483    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
4484
4485    table=cnv->sharedData->mbcs.fromUnicodeTable;
4486    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
4487    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
4488        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
4489    } else {
4490        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
4491    }
4492    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
4493
4494    if(cnv->useFallback) {
4495        /* use all roundtrip and fallback results */
4496        minValue=0x800;
4497    } else {
4498        /* use only roundtrips and fallbacks from private-use characters */
4499        minValue=0xc00;
4500    }
4501    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
4502
4503    /* get the converter state from the UTF-8 UConverter */
4504    c=(UChar32)utf8->toUnicodeStatus;
4505    if(c!=0) {
4506        toULength=oldToULength=utf8->toULength;
4507        toULimit=(int8_t)utf8->mode;
4508    } else {
4509        toULength=oldToULength=toULimit=0;
4510    }
4511
4512    /*
4513     * Make sure that the last byte sequence before sourceLimit is complete
4514     * or runs into a lead byte.
4515     * Do not go back into the bytes that will be read for finishing a partial
4516     * sequence from the previous buffer.
4517     * In the conversion loop compare source with sourceLimit only once
4518     * per multi-byte character.
4519     */
4520    {
4521        int32_t i, length;
4522
4523        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
4524        for(i=0; i<3 && i<length;) {
4525            b=*(sourceLimit-i-1);
4526            if(U8_IS_TRAIL(b)) {
4527                ++i;
4528            } else {
4529                if(i<utf8_countTrailBytes[b]) {
4530                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
4531                    sourceLimit-=i+1;
4532                }
4533                break;
4534            }
4535        }
4536    }
4537
4538    if(c!=0 && targetCapacity>0) {
4539        utf8->toUnicodeStatus=0;
4540        utf8->toULength=0;
4541        goto moreBytes;
4542        /* See note in ucnv_SBCSFromUTF8() about this goto. */
4543    }
4544
4545    /* conversion loop */
4546    while(source<sourceLimit) {
4547        if(targetCapacity>0) {
4548            b=*source++;
4549            if((int8_t)b>=0) {
4550                /* convert ASCII */
4551                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
4552                    *target++=b;
4553                    --targetCapacity;
4554                    continue;
4555                } else {
4556                    value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
4557                    if(value==0) {
4558                        c=b;
4559                        goto unassigned;
4560                    }
4561                }
4562            } else {
4563                if(b>0xe0) {
4564                    if( /* handle U+1000..U+D7FF inline */
4565                        (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
4566                                                        (b==0xed && (t1 <= 0x1f))) &&
4567                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
4568                    ) {
4569                        c=((b&0xf)<<6)|t1;
4570                        source+=2;
4571                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
4572                        if(value==0) {
4573                            c=(c<<6)|t2;
4574                            goto unassigned;
4575                        }
4576                    } else {
4577                        c=-1;
4578                    }
4579                } else if(b<0xe0) {
4580                    if( /* handle U+0080..U+07FF inline */
4581                        b>=0xc2 &&
4582                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
4583                    ) {
4584                        c=b&0x1f;
4585                        ++source;
4586                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
4587                        if(value==0) {
4588                            c=(c<<6)|t1;
4589                            goto unassigned;
4590                        }
4591                    } else {
4592                        c=-1;
4593                    }
4594                } else {
4595                    c=-1;
4596                }
4597
4598                if(c<0) {
4599                    /* handle "complicated" and error cases, and continuing partial characters */
4600                    oldToULength=0;
4601                    toULength=1;
4602                    toULimit=utf8_countTrailBytes[b]+1;
4603                    c=b;
4604moreBytes:
4605                    while(toULength<toULimit) {
4606                        if(source<sourceLimit) {
4607                            b=*source;
4608                            if(U8_IS_TRAIL(b)) {
4609                                ++source;
4610                                ++toULength;
4611                                c=(c<<6)+b;
4612                            } else {
4613                                break; /* sequence too short, stop with toULength<toULimit */
4614                            }
4615                        } else {
4616                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
4617                            source-=(toULength-oldToULength);
4618                            while(oldToULength<toULength) {
4619                                utf8->toUBytes[oldToULength++]=*source++;
4620                            }
4621                            utf8->toUnicodeStatus=c;
4622                            utf8->toULength=toULength;
4623                            utf8->mode=toULimit;
4624                            pToUArgs->source=(char *)source;
4625                            pFromUArgs->target=(char *)target;
4626                            return;
4627                        }
4628                    }
4629
4630                    if( toULength==toULimit &&      /* consumed all trail bytes */
4631                        (toULength==3 || toULength==2) &&             /* BMP */
4632                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
4633                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
4634                    ) {
4635                        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4636                    } else if(
4637                        toULength==toULimit && toULength==4 &&
4638                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
4639                    ) {
4640                        /* supplementary code point */
4641                        if(!hasSupplementary) {
4642                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4643                            stage2Entry=0;
4644                        } else {
4645                            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4646                        }
4647                    } else {
4648                        /* error handling: illegal UTF-8 byte sequence */
4649                        source-=(toULength-oldToULength);
4650                        while(oldToULength<toULength) {
4651                            utf8->toUBytes[oldToULength++]=*source++;
4652                        }
4653                        utf8->toULength=toULength;
4654                        pToUArgs->source=(char *)source;
4655                        pFromUArgs->target=(char *)target;
4656                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4657                        return;
4658                    }
4659
4660                    /* get the bytes and the length for the output */
4661                    /* MBCS_OUTPUT_2 */
4662                    value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
4663
4664                    /* is this code point assigned, or do we use fallbacks? */
4665                    if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
4666                         (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
4667                    ) {
4668                        goto unassigned;
4669                    }
4670                }
4671            }
4672
4673            /* write the output character bytes from value and length */
4674            /* from the first if in the loop we know that targetCapacity>0 */
4675            if(value<=0xff) {
4676                /* this is easy because we know that there is enough space */
4677                *target++=(uint8_t)value;
4678                --targetCapacity;
4679            } else /* length==2 */ {
4680                *target++=(uint8_t)(value>>8);
4681                if(2<=targetCapacity) {
4682                    *target++=(uint8_t)value;
4683                    targetCapacity-=2;
4684                } else {
4685                    cnv->charErrorBuffer[0]=(char)value;
4686                    cnv->charErrorBufferLength=1;
4687
4688                    /* target overflow */
4689                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4690                    break;
4691                }
4692            }
4693            continue;
4694
4695unassigned:
4696            {
4697                /*
4698                 * Try an extension mapping.
4699                 * Pass in no source because we don't have UTF-16 input.
4700                 * If we have a partial match on c, we will return and revert
4701                 * to UTF-8->UTF-16->charset conversion.
4702                 */
4703                static const UChar nul=0;
4704                const UChar *noSource=&nul;
4705                c=_extFromU(cnv, cnv->sharedData,
4706                            c, &noSource, noSource,
4707                            &target, target+targetCapacity,
4708                            NULL, -1,
4709                            pFromUArgs->flush,
4710                            pErrorCode);
4711
4712                if(U_FAILURE(*pErrorCode)) {
4713                    /* not mappable or buffer overflow */
4714                    cnv->fromUChar32=c;
4715                    break;
4716                } else if(cnv->preFromUFirstCP>=0) {
4717                    /*
4718                     * Partial match, return and revert to pivoting.
4719                     * In normal from-UTF-16 conversion, we would just continue
4720                     * but then exit the loop because the extension match would
4721                     * have consumed the source.
4722                     */
4723                    break;
4724                } else {
4725                    /* a mapping was written to the target, continue */
4726
4727                    /* recalculate the targetCapacity after an extension mapping */
4728                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
4729                    continue;
4730                }
4731            }
4732        } else {
4733            /* target is full */
4734            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4735            break;
4736        }
4737    }
4738
4739    /*
4740     * The sourceLimit may have been adjusted before the conversion loop
4741     * to stop before a truncated sequence.
4742     * If so, then collect the truncated sequence now.
4743     */
4744    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
4745        c=utf8->toUBytes[0]=b=*source++;
4746        toULength=1;
4747        toULimit=utf8_countTrailBytes[b]+1;
4748        while(source<sourceLimit) {
4749            utf8->toUBytes[toULength++]=b=*source++;
4750            c=(c<<6)+b;
4751        }
4752        utf8->toUnicodeStatus=c;
4753        utf8->toULength=toULength;
4754        utf8->mode=toULimit;
4755    }
4756
4757    /* write back the updated pointers */
4758    pToUArgs->source=(char *)source;
4759    pFromUArgs->target=(char *)target;
4760}
4761
4762/* miscellaneous ------------------------------------------------------------ */
4763
4764static void
4765ucnv_MBCSGetStarters(const UConverter* cnv,
4766                 UBool starters[256],
4767                 UErrorCode *pErrorCode) {
4768    const int32_t *state0;
4769    int i;
4770
4771    state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
4772    for(i=0; i<256; ++i) {
4773        /* all bytes that cause a state transition from state 0 are lead bytes */
4774        starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
4775    }
4776}
4777
4778/*
4779 * This is an internal function that allows other converter implementations
4780 * to check whether a byte is a lead byte.
4781 */
4782U_CFUNC UBool
4783ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
4784    return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
4785}
4786
4787static void
4788ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
4789              int32_t offsetIndex,
4790              UErrorCode *pErrorCode) {
4791    UConverter *cnv=pArgs->converter;
4792    char *p, *subchar;
4793    char buffer[4];
4794    int32_t length;
4795
4796    /* first, select between subChar and subChar1 */
4797    if( cnv->subChar1!=0 &&
4798        (cnv->sharedData->mbcs.extIndexes!=NULL ?
4799            cnv->useSubChar1 :
4800            (cnv->invalidUCharBuffer[0]<=0xff))
4801    ) {
4802        /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
4803        subchar=(char *)&cnv->subChar1;
4804        length=1;
4805    } else {
4806        /* select subChar in all other cases */
4807        subchar=(char *)cnv->subChars;
4808        length=cnv->subCharLen;
4809    }
4810
4811    /* reset the selector for the next code point */
4812    cnv->useSubChar1=FALSE;
4813
4814    if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
4815        p=buffer;
4816
4817        /* fromUnicodeStatus contains prevLength */
4818        switch(length) {
4819        case 1:
4820            if(cnv->fromUnicodeStatus==2) {
4821                /* DBCS mode and SBCS sub char: change to SBCS */
4822                cnv->fromUnicodeStatus=1;
4823                *p++=UCNV_SI;
4824            }
4825            *p++=subchar[0];
4826            break;
4827        case 2:
4828            if(cnv->fromUnicodeStatus<=1) {
4829                /* SBCS mode and DBCS sub char: change to DBCS */
4830                cnv->fromUnicodeStatus=2;
4831                *p++=UCNV_SO;
4832            }
4833            *p++=subchar[0];
4834            *p++=subchar[1];
4835            break;
4836        default:
4837            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
4838            return;
4839        }
4840        subchar=buffer;
4841        length=(int32_t)(p-buffer);
4842    }
4843
4844    ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
4845}
4846
4847U_CFUNC UConverterType
4848ucnv_MBCSGetType(const UConverter* converter) {
4849    /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
4850    if(converter->sharedData->mbcs.countStates==1) {
4851        return (UConverterType)UCNV_SBCS;
4852    } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
4853        return (UConverterType)UCNV_EBCDIC_STATEFUL;
4854    } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
4855        return (UConverterType)UCNV_DBCS;
4856    }
4857    return (UConverterType)UCNV_MBCS;
4858}
4859
4860static const UConverterImpl _SBCSUTF8Impl={
4861    UCNV_MBCS,
4862
4863    ucnv_MBCSLoad,
4864    ucnv_MBCSUnload,
4865
4866    ucnv_MBCSOpen,
4867    NULL,
4868    NULL,
4869
4870    ucnv_MBCSToUnicodeWithOffsets,
4871    ucnv_MBCSToUnicodeWithOffsets,
4872    ucnv_MBCSFromUnicodeWithOffsets,
4873    ucnv_MBCSFromUnicodeWithOffsets,
4874    ucnv_MBCSGetNextUChar,
4875
4876    ucnv_MBCSGetStarters,
4877    ucnv_MBCSGetName,
4878    ucnv_MBCSWriteSub,
4879    NULL,
4880    ucnv_MBCSGetUnicodeSet,
4881
4882    NULL,
4883    ucnv_SBCSFromUTF8
4884};
4885
4886static const UConverterImpl _DBCSUTF8Impl={
4887    UCNV_MBCS,
4888
4889    ucnv_MBCSLoad,
4890    ucnv_MBCSUnload,
4891
4892    ucnv_MBCSOpen,
4893    NULL,
4894    NULL,
4895
4896    ucnv_MBCSToUnicodeWithOffsets,
4897    ucnv_MBCSToUnicodeWithOffsets,
4898    ucnv_MBCSFromUnicodeWithOffsets,
4899    ucnv_MBCSFromUnicodeWithOffsets,
4900    ucnv_MBCSGetNextUChar,
4901
4902    ucnv_MBCSGetStarters,
4903    ucnv_MBCSGetName,
4904    ucnv_MBCSWriteSub,
4905    NULL,
4906    ucnv_MBCSGetUnicodeSet,
4907
4908    NULL,
4909    ucnv_DBCSFromUTF8
4910};
4911
4912static const UConverterImpl _MBCSImpl={
4913    UCNV_MBCS,
4914
4915    ucnv_MBCSLoad,
4916    ucnv_MBCSUnload,
4917
4918    ucnv_MBCSOpen,
4919    NULL,
4920    NULL,
4921
4922    ucnv_MBCSToUnicodeWithOffsets,
4923    ucnv_MBCSToUnicodeWithOffsets,
4924    ucnv_MBCSFromUnicodeWithOffsets,
4925    ucnv_MBCSFromUnicodeWithOffsets,
4926    ucnv_MBCSGetNextUChar,
4927
4928    ucnv_MBCSGetStarters,
4929    ucnv_MBCSGetName,
4930    ucnv_MBCSWriteSub,
4931    NULL,
4932    ucnv_MBCSGetUnicodeSet
4933};
4934
4935
4936/* Static data is in tools/makeconv/ucnvstat.c for data-based
4937 * converters. Be sure to update it as well.
4938 */
4939
4940const UConverterSharedData _MBCSData={
4941    sizeof(UConverterSharedData), 1,
4942    NULL, NULL, NULL, FALSE, &_MBCSImpl,
4943    0
4944};
4945
4946#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
4947