1/*
2******************************************************************************
3*
4*   Copyright (C) 2000-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  ucnvmbcs.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2000jul03
14*   created by: Markus W. Scherer
15*
16*   The current code in this file replaces the previous implementation
17*   of conversion code from multi-byte codepages to Unicode and back.
18*   This implementation supports the following:
19*   - legacy variable-length codepages with up to 4 bytes per character
20*   - all Unicode code points (up to 0x10ffff)
21*   - efficient distinction of unassigned vs. illegal byte sequences
22*   - it is possible in fromUnicode() to directly deal with simple
23*     stateful encodings (used for EBCDIC_STATEFUL)
24*   - it is possible to convert Unicode code points
25*     to a single zero byte (but not as a fallback except for SBCS)
26*
27*   Remaining limitations in fromUnicode:
28*   - byte sequences must not have leading zero bytes
29*   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
30*   - limitation to up to 4 bytes per character
31*
32*   ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
33*   limitations and adds m:n character mappings and other features.
34*   See ucnv_ext.h for details.
35*
36*   Change history:
37*
38*    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
39*                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
40*                             macros to ucnvmbcs.h file
41*/
42
43#include "unicode/utypes.h"
44
45#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
46
47#include "unicode/ucnv.h"
48#include "unicode/ucnv_cb.h"
49#include "unicode/udata.h"
50#include "unicode/uset.h"
51#include "ucnv_bld.h"
52#include "ucnvmbcs.h"
53#include "ucnv_ext.h"
54#include "ucnv_cnv.h"
55#include "umutex.h"
56#include "cmemory.h"
57#include "cstring.h"
58
/*
 * Platform-dependent optimization switches.
 * NOTE(review): judging by the names, these select unrolled loops in the
 * single-byte to-BMP and from-BMP conversion functions; those functions are
 * outside this part of the file -- confirm before changing the values.
 */
#define MBCS_UNROLL_SINGLE_TO_BMP   1
#define MBCS_UNROLL_SINGLE_FROM_BMP 0
62
63/*
64 * _MBCSHeader versions 5.3 & 4.3
65 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
66 *
67 * This version is optional. Version 5 is used for incompatible data format changes.
68 * makeconv will continue to generate version 4 files if possible.
69 *
70 * Changes from version 4:
71 *
72 * The main difference is an additional _MBCSHeader field with
73 * - the length (number of uint32_t) of the _MBCSHeader
74 * - flags for further incompatible data format changes
75 * - flags for further, backward compatible data format changes
76 *
 * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
78 * the file and needs to be reconstituted at load time.
79 * This requires a utf8Friendly format with an additional mbcsIndex table for fast
80 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
81 * (For details about these structures see below, and see ucnvmbcs.h.)
82 *
83 *   utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
84 *   of the Unicode code points. (This requires that the .ucm file has the |0 etc.
85 *   precision markers for all mappings.)
86 *
87 *   All fallbacks have been moved to the extension table, leaving only roundtrips in the
88 *   omitted data that can be reconstituted from the toUnicode data.
89 *
90 *   Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
91 *   With only roundtrip mappings in the base fromUnicode data, this part is fully
92 *   redundant with the mbcsIndex and will be reconstituted from that (also using the
93 *   stage 1 table which contains the information about how stage 2 was compacted).
94 *
95 *   The rest of the stage 2 table, the part for code points above maxFastUChar,
96 *   is stored in the file and will be appended to the reconstituted part.
97 *
 *   The entire fromUBytes array is omitted from the file and will be reconstituted.
99 *   This is done by enumerating all toUnicode roundtrip mappings, performing
100 *   each mapping (using the stage 1 and reconstituted stage 2 tables) and
101 *   writing instead of reading the byte values.
102 *
103 * _MBCSHeader version 4.3
104 *
105 * Change from version 4.2:
106 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
107 *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
108 *   files which can be used instead of stages 1 & 2.
109 *   Faster lookups for roundtrips from most commonly used characters,
110 *   and lookups from UTF-8 byte sequences with a natural bit distribution.
111 *   See ucnvmbcs.h for more details.
112 *
113 * Change from version 4.1:
114 * - Added an optional extension table structure at the end of the .cnv file.
115 *   It is present if the upper bits of the header flags field contains a non-zero
116 *   byte offset to it.
117 *   Files that contain only a conversion table and no base table
118 *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
119 *   These contain the base table name between the MBCS header and the extension
120 *   data.
121 *
122 * Change from version 4.0:
123 * - Replace header.reserved with header.fromUBytesLength so that all
124 *   fields in the data have length.
125 *
126 * Changes from version 3 (for performance improvements):
127 * - new bit distribution for state table entries
128 * - reordered action codes
129 * - new data structure for single-byte fromUnicode
130 *   + stage 2 only contains indexes
131 *   + stage 3 stores 16 bits per character with classification bits 15..8
132 * - no multiplier for stage 1 entries
133 * - stage 2 for non-single-byte codepages contains the index and the flags in
134 *   one 32-bit value
135 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
136 *
137 * For more details about old versions of the MBCS data structure, see
138 * the corresponding versions of this file.
139 *
140 * Converting stateless codepage data ---------------------------------------***
141 * (or codepage data with simple states) to Unicode.
142 *
143 * Data structure and algorithm for converting from complex legacy codepages
144 * to Unicode. (Designed before 2000-may-22.)
145 *
146 * The basic idea is that the structure of legacy codepages can be described
147 * with state tables.
148 * When reading a byte stream, each input byte causes a state transition.
149 * Some transitions result in the output of a code point, some result in
150 * "unassigned" or "illegal" output.
151 * This is used here for character conversion.
152 *
153 * The data structure begins with a state table consisting of a row
154 * per state, with 256 entries (columns) per row for each possible input
155 * byte value.
156 * Each entry is 32 bits wide, with two formats distinguished by
157 * the sign bit (bit 31):
158 *
159 * One format for transitional entries (bit 31 not set) for non-final bytes, and
160 * one format for final entries (bit 31 set).
161 * Both formats contain the number of the next state in the same bit
162 * positions.
163 * State 0 is the initial state.
164 *
165 * Most of the time, the offset values of subsequent states are added
166 * up to a scalar value. This value will eventually be the index of
167 * the Unicode code point in a table that follows the state table.
168 * The effect is that the code points for final state table rows
169 * are contiguous. The code points of final state rows follow each other
170 * in the order of the references to those final states by previous
171 * states, etc.
172 *
 * For some terminal states, the offset is itself the output Unicode
 * code point (16 bits for a BMP code point or 20 bits for a supplementary
 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
176 * For others, the code point in the Unicode table is stored with either
177 * one or two code units: one for BMP code points, two for a pair of
178 * surrogates.
179 * All code points for a final state entry take up the same number of code
180 * units, regardless of whether they all actually _use_ the same number
181 * of code units. This is necessary for simple array access.
182 *
183 * An additional feature comes in with what in ICU is called "fallback"
184 * mappings:
185 *
186 * In addition to round-trippable, precise, 1:1 mappings, there are often
187 * mappings defined between similar, though not the same, characters.
188 * Typically, such mappings occur only in fromUnicode mapping tables because
189 * Unicode has a superset repertoire of most other codepages. However, it
190 * is possible to provide such mappings in the toUnicode tables, too.
191 * In this case, the fallback mappings are partly integrated into the
192 * general state tables because the structure of the encoding includes their
193 * byte sequences.
194 * For final entries in an initial state, fallback mappings are stored in
195 * the entry itself like with roundtrip mappings.
196 * For other final entries, they are stored in the code units table if
197 * the entry is for a pair of code units.
198 * For single-unit results in the code units table, there is no space to
199 * alternatively hold a fallback mapping; in this case, the code unit
200 * is stored as U+fffe (unassigned), and the fallback mapping needs to
201 * be looked up by the scalar offset value in a separate table.
202 *
203 * "Unassigned" state entries really mean "structurally unassigned",
204 * i.e., such a byte sequence will never have a mapping result.
205 *
206 * The interpretation of the bits in each entry is as follows:
207 *
208 * Bit 31 not set, not a terminal entry ("transitional"):
209 * 30..24 next state
210 * 23..0  offset delta, to be added up
211 *
212 * Bit 31 set, terminal ("final") entry:
213 * 30..24 next state (regardless of action code)
214 * 23..20 action code:
215 *        action codes 0 and 1 result in precise-mapping Unicode code points
216 *        0  valid byte sequence
217 *           19..16 not used, 0
218 *           15..0  16-bit Unicode BMP code point
219 *                  never U+fffe or U+ffff
220 *        1  valid byte sequence
221 *           19..0  20-bit Unicode supplementary code point
222 *                  never U+fffe or U+ffff
223 *
224 *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
225 *        2  valid byte sequence (fallback)
226 *           19..16 not used, 0
227 *           15..0  16-bit Unicode BMP code point as fallback result
228 *        3  valid byte sequence (fallback)
229 *           19..0  20-bit Unicode supplementary code point as fallback result
230 *
231 *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
232 *        depending on the code units they result in
233 *        4  valid byte sequence
234 *           19..9  not used, 0
235 *            8..0  final offset delta
236 *                  pointing to one 16-bit code unit which may be
237 *                  fffe  unassigned -- look for a fallback for this offset
238 *                  ffff  illegal
239 *        5  valid byte sequence
240 *           19..9  not used, 0
241 *            8..0  final offset delta
242 *                  pointing to two 16-bit code units
243 *                  (typically UTF-16 surrogates)
244 *                  the result depends on the first code unit as follows:
245 *                  0000..d7ff  roundtrip BMP code point (1st alone)
246 *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
247 *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
248 *                  e000        roundtrip BMP code point (2nd alone)
249 *                  e001        fallback BMP code point (2nd alone)
250 *                  fffe        unassigned
251 *                  ffff        illegal
252 *           (the final offset deltas are at most 255 * 2,
253 *            times 2 because of storing code unit pairs)
254 *
255 *        6  unassigned byte sequence
256 *           19..16 not used, 0
257 *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
258 *                  this does not contain a final offset delta because the main
259 *                  purpose of this action code is to save scalar offset values;
260 *                  therefore, fallback values cannot be assigned to byte
261 *                  sequences that result in this action code
262 *        7  illegal byte sequence
263 *           19..16 not used, 0
264 *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
265 *        8  state change only
266 *           19..0  not used, 0
267 *           useful for state changes in simple stateful encodings,
268 *           at Shift-In/Shift-Out codes
269 *
270 *
271 *        9..15 reserved for future use
272 *           current implementations will only perform a state change
273 *           and ignore bits 19..0
274 *
275 * An encoding with contiguous ranges of unassigned byte sequences, like
276 * Shift-JIS and especially EUC-TW, can be stored efficiently by having
277 * at least two states for the trail bytes:
278 * One trail byte state that results in code points, and one that only
279 * has "unassigned" and "illegal" terminal states.
280 *
281 * Note: partly by accident, this data structure supports simple stateful
282 * encodings without any additional logic.
283 * Currently, only simple Shift-In/Shift-Out schemes are handled with
284 * appropriate state tables (especially EBCDIC_STATEFUL!).
285 *
286 * MBCS version 2 added:
287 * unassigned and illegal action codes have U+fffe and U+ffff
288 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
289 *
290 * Converting from Unicode to codepage bytes --------------------------------***
291 *
292 * The conversion data structure for fromUnicode is designed for the known
293 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
294 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
295 * a roundtrip mapping.
296 *
297 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
298 * like in the character properties table.
299 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
300 * with the resulting bytes is at offsetFromUBytes.
301 *
302 * Beginning with version 4, single-byte codepages have a significantly different
303 * trie compared to other codepages.
304 * In all cases, the entry in stage 1 is directly the index of the block of
305 * 64 entries in stage 2.
306 *
307 * Single-byte lookup:
308 *
309 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
310 * Stage 3 contains one 16-bit word per result:
311 * Bits 15..8 indicate the kind of result:
312 *    f  roundtrip result
313 *    c  fallback result from private-use code point
314 *    8  fallback result from other code points
315 *    0  unassigned
316 * Bits 7..0 contain the codepage byte. A zero byte is always possible.
317 *
318 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
319 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
320 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
321 * ASCII code points can be looked up with a linear array access into stage 3.
322 * See maxFastUChar and other details in ucnvmbcs.h.
323 *
324 * Multi-byte lookup:
325 *
326 * Stage 2 contains a 32-bit word for each 16-block in stage 3:
327 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
328 *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
329 *             If this test is false, then a non-zero result will be interpreted as
330 *             a fallback mapping.
331 * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
332 *
333 * Stage 3 contains 2, 3, or 4 bytes per result.
334 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
335 * while 3 bytes are stored as bytes in big-endian order.
336 * Leading zero bytes are ignored, and the number of bytes is counted.
337 * A zero byte mapping result is possible as a roundtrip result.
338 * For some output types, the actual result is processed from this;
339 * see ucnv_MBCSFromUnicodeWithOffsets().
340 *
341 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
342 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
343 *
344 * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
345 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
346 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
347 * ASCII code points can be looked up with a linear array access into stage 3.
348 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
349 *
350 * In version 3, stage 2 blocks may overlap by multiples of the multiplier
351 * for compaction.
352 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
353 * may overlap by any number of entries.
354 *
355 * MBCS version 2 added:
356 * the converter checks for known output types, which allows
357 * adding new ones without crashing an unaware converter
358 */
359
360static const UConverterImpl _SBCSUTF8Impl;
361static const UConverterImpl _DBCSUTF8Impl;
362
363/* GB 18030 data ------------------------------------------------------------ */
364
/*
 * Helper macros for linear values for GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) folds the four bytes a..d of a sequence into the
 * single number ((a*10+b)*126+c)*10+d. The bytes are used as they are (no
 * per-byte base is subtracted), so results are meant to be compared with or
 * subtracted from each other, e.g. relative to LINEAR_18030_BASE.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

/* linear value of the four-byte sequence 81 30 81 30 */
#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

/*
 * Linear value of a four-byte sequence packed big-endian into one 32-bit
 * value x (first byte in bits 31..24).
 * x is fully parenthesized so that expression arguments (a|b, a+1, ...)
 * are split into bytes correctly. x is evaluated four times, so it must not
 * have side effects.
 */
#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)
371
372/*
373 * Some ranges of GB 18030 where both the Unicode code points and the
374 * GB four-byte sequences are contiguous and are handled algorithmically by
375 * the special callback functions below.
376 * The values are start & end of Unicode & GB codes.
377 *
378 * Note that single surrogates are not mapped by GB 18030
379 * as of the re-released mapping tables from 2000-nov-30.
380 */
381static const uint32_t
382gb18030Ranges[13][4]={
383    {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
384    {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
385    {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
386    {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
387    {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
388    {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
389    {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
390    {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
391    {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
392    {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
393    {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
394    {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
395    {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
396};
397
/* bit flag in UConverter.options requesting GB 18030 special handling */
#define _MBCS_OPTION_GB18030 0x8000

/* bit flags in UConverter.options requesting KEIS, JEF, JIPS special handling */
#define _MBCS_OPTION_KEIS 0x1000
#define _MBCS_OPTION_JEF  0x2000
#define _MBCS_OPTION_JIPS 0x4000

/*
 * Shift-out (SO) and shift-in (SI) byte sequences of these variants,
 * as written by getSISOBytes(). KEIS and JIPS use two bytes, JEF one.
 */
#define KEIS_SO_CHAR_1 0x0A
#define KEIS_SO_CHAR_2 0x42
#define KEIS_SI_CHAR_1 0x0A
#define KEIS_SI_CHAR_2 0x41

#define JEF_SO_CHAR 0x28
#define JEF_SI_CHAR 0x29

#define JIPS_SO_CHAR_1 0x1A
#define JIPS_SO_CHAR_2 0x70
#define JIPS_SI_CHAR_1 0x1A
#define JIPS_SI_CHAR_2 0x71
418
/* Selects which shift sequence getSISOBytes() produces: shift-in or shift-out. */
typedef enum SISO_Option {
    SI = 0,
    SO = 1
} SISO_Option;
424
425static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) {
426    int32_t SISOLength = 0;
427
428    switch (option) {
429        case SI:
430            if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
431                value[0] = KEIS_SI_CHAR_1;
432                value[1] = KEIS_SI_CHAR_2;
433                SISOLength = 2;
434            } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
435                value[0] = JEF_SI_CHAR;
436                SISOLength = 1;
437            } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
438                value[0] = JIPS_SI_CHAR_1;
439                value[1] = JIPS_SI_CHAR_2;
440                SISOLength = 2;
441            } else {
442                value[0] = UCNV_SI;
443                SISOLength = 1;
444            }
445            break;
446        case SO:
447            if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
448                value[0] = KEIS_SO_CHAR_1;
449                value[1] = KEIS_SO_CHAR_2;
450                SISOLength = 2;
451            } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
452                value[0] = JEF_SO_CHAR;
453                SISOLength = 1;
454            } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
455                value[0] = JIPS_SO_CHAR_1;
456                value[1] = JIPS_SO_CHAR_2;
457                SISOLength = 2;
458            } else {
459                value[0] = UCNV_SO;
460                SISOLength = 1;
461            }
462            break;
463        default:
464            /* Should never happen. */
465            break;
466    }
467
468    return SISOLength;
469}
470
471/* Miscellaneous ------------------------------------------------------------ */
472
473/**
474 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
475 * consecutive sequences of bytes, starting from the one encoded in value,
476 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
477 * Does not currently support m:n mappings or reverse fallbacks.
478 * This function will not be called for sequences of bytes with leading zeros.
479 *
480 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
481 * @param value contains 1..4 bytes of the first byte sequence, right-aligned
482 * @param codePoints resulting Unicode code points, or negative if a byte sequence does
483 *        not map to anything
484 * @return TRUE to continue enumeration, FALSE to stop
485 */
486typedef UBool U_CALLCONV
487UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
488
489/* similar to ucnv_MBCSGetNextUChar() but recursive */
490static UBool
491enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
492        int32_t state, uint32_t offset,
493        uint32_t value,
494        UConverterEnumToUCallback *callback, const void *context,
495        UErrorCode *pErrorCode) {
496    UChar32 codePoints[32];
497    const int32_t *row;
498    const uint16_t *unicodeCodeUnits;
499    UChar32 anyCodePoints;
500    int32_t b, limit;
501
502    row=mbcsTable->stateTable[state];
503    unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
504
505    value<<=8;
506    anyCodePoints=-1;  /* becomes non-negative if there is a mapping */
507
508    b=(stateProps[state]&0x38)<<2;
509    if(b==0 && stateProps[state]>=0x40) {
510        /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
511        codePoints[0]=U_SENTINEL;
512        b=1;
513    }
514    limit=((stateProps[state]&7)+1)<<5;
515    while(b<limit) {
516        int32_t entry=row[b];
517        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
518            int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
519            if(stateProps[nextState]>=0) {
520                /* recurse to a state with non-ignorable actions */
521                if(!enumToU(
522                        mbcsTable, stateProps, nextState,
523                        offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
524                        value|(uint32_t)b,
525                        callback, context,
526                        pErrorCode)) {
527                    return FALSE;
528                }
529            }
530            codePoints[b&0x1f]=U_SENTINEL;
531        } else {
532            UChar32 c;
533            int32_t action;
534
535            /*
536             * An if-else-if chain provides more reliable performance for
537             * the most common cases compared to a switch.
538             */
539            action=MBCS_ENTRY_FINAL_ACTION(entry);
540            if(action==MBCS_STATE_VALID_DIRECT_16) {
541                /* output BMP code point */
542                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
543            } else if(action==MBCS_STATE_VALID_16) {
544                int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
545                c=unicodeCodeUnits[finalOffset];
546                if(c<0xfffe) {
547                    /* output BMP code point */
548                } else {
549                    c=U_SENTINEL;
550                }
551            } else if(action==MBCS_STATE_VALID_16_PAIR) {
552                int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
553                c=unicodeCodeUnits[finalOffset++];
554                if(c<0xd800) {
555                    /* output BMP code point below 0xd800 */
556                } else if(c<=0xdbff) {
557                    /* output roundtrip or fallback supplementary code point */
558                    c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
559                } else if(c==0xe000) {
560                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
561                    c=unicodeCodeUnits[finalOffset];
562                } else {
563                    c=U_SENTINEL;
564                }
565            } else if(action==MBCS_STATE_VALID_DIRECT_20) {
566                /* output supplementary code point */
567                c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
568            } else {
569                c=U_SENTINEL;
570            }
571
572            codePoints[b&0x1f]=c;
573            anyCodePoints&=c;
574        }
575        if(((++b)&0x1f)==0) {
576            if(anyCodePoints>=0) {
577                if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
578                    return FALSE;
579                }
580                anyCodePoints=-1;
581            }
582        }
583    }
584    return TRUE;
585}
586
587/*
588 * Only called if stateProps[state]==-1.
589 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
590 * MBCS_STATE_CHANGE_ONLY.
591 */
592static int8_t
593getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
594    const int32_t *row;
595    int32_t min, max, entry, nextState;
596
597    row=stateTable[state];
598    stateProps[state]=0;
599
600    /* find first non-ignorable state */
601    for(min=0;; ++min) {
602        entry=row[min];
603        nextState=MBCS_ENTRY_STATE(entry);
604        if(stateProps[nextState]==-1) {
605            getStateProp(stateTable, stateProps, nextState);
606        }
607        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
608            if(stateProps[nextState]>=0) {
609                break;
610            }
611        } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
612            break;
613        }
614        if(min==0xff) {
615            stateProps[state]=-0x40;  /* (int8_t)0xc0 */
616            return stateProps[state];
617        }
618    }
619    stateProps[state]|=(int8_t)((min>>5)<<3);
620
621    /* find last non-ignorable state */
622    for(max=0xff; min<max; --max) {
623        entry=row[max];
624        nextState=MBCS_ENTRY_STATE(entry);
625        if(stateProps[nextState]==-1) {
626            getStateProp(stateTable, stateProps, nextState);
627        }
628        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
629            if(stateProps[nextState]>=0) {
630                break;
631            }
632        } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
633            break;
634        }
635    }
636    stateProps[state]|=(int8_t)(max>>5);
637
638    /* recurse further and collect direct-state information */
639    while(min<=max) {
640        entry=row[min];
641        nextState=MBCS_ENTRY_STATE(entry);
642        if(stateProps[nextState]==-1) {
643            getStateProp(stateTable, stateProps, nextState);
644        }
645        if(MBCS_ENTRY_IS_FINAL(entry)) {
646            stateProps[nextState]|=0x40;
647            if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
648                stateProps[state]|=0x40;
649            }
650        }
651        ++min;
652    }
653    return stateProps[state];
654}
655
656/*
657 * Internal function enumerating the toUnicode data of an MBCS converter.
658 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
659 * table, but could also be used for a future ucnv_getUnicodeSet() option
660 * that includes reverse fallbacks (after updating this function's implementation).
661 * Currently only handles roundtrip mappings.
662 * Does not currently handle extensions.
663 */
664static void
665ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
666                       UConverterEnumToUCallback *callback, const void *context,
667                       UErrorCode *pErrorCode) {
668    /*
669     * Properties for each state, to speed up the enumeration.
670     * Ignorable actions are unassigned/illegal/state-change-only:
671     * They do not lead to mappings.
672     *
673     * Bits 7..6:
674     * 1 direct/initial state (stateful converters have multiple)
675     * 0 non-initial state with transitions or with non-ignorable result actions
676     * -1 final state with only ignorable actions
677     *
678     * Bits 5..3:
679     * The lowest byte value with non-ignorable actions is
680     * value<<5 (rounded down).
681     *
682     * Bits 2..0:
683     * The highest byte value with non-ignorable actions is
684     * (value<<5)&0x1f (rounded up).
685     */
686    int8_t stateProps[MBCS_MAX_STATE_COUNT];
687    int32_t state;
688
689    uprv_memset(stateProps, -1, sizeof(stateProps));
690
691    /* recurse from state 0 and set all stateProps */
692    getStateProp(mbcsTable->stateTable, stateProps, 0);
693
694    for(state=0; state<mbcsTable->countStates; ++state) {
695        /*if(stateProps[state]==-1) {
696            printf("unused/unreachable <icu:state> %d\n", state);
697        }*/
698        if(stateProps[state]>=0x40) {
699            /* start from each direct state */
700            enumToU(
701                mbcsTable, stateProps, state, 0, 0,
702                callback, context,
703                pErrorCode);
704        }
705    }
706}
707
/*
 * Enumerates the from-Unicode trie of an MBCS table and calls
 * sa->add(sa->set, c) for each code point c whose mapping is selected.
 * Unless the function returns early with an error, it finishes by calling
 * ucnv_extGetUnicodeSet() with the same arguments.
 *
 * @param sharedData  shared converter data; the table is read from sharedData->mbcs
 * @param sa          set adder; this function uses sa->add and sa->set
 * @param which       UCNV_ROUNDTRIP_SET selects roundtrip mappings only.
 *                    For MBCS_OUTPUT_1 tables every other value also selects fallbacks;
 *                    for all other tables fallbacks are selected only by
 *                    UCNV_ROUNDTRIP_AND_FALLBACK_SET.
 * @param filter      restricts which multi-byte results are accepted (see the switch below);
 *                    it is not consulted in the MBCS_OUTPUT_1 branch of this function
 * @param pErrorCode  set to U_INTERNAL_PROGRAM_ERROR if a non-empty stage 3 block is
 *                    reached with an unknown filter value; the function then returns
 *                    without calling ucnv_extGetUnicodeSet()
 */
U_CFUNC void
ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
                                         const USetAdder *sa,
                                         UConverterUnicodeSet which,
                                         UConverterSetFilter filter,
                                         UErrorCode *pErrorCode) {
    const UConverterMBCSTable *mbcsTable;
    const uint16_t *table;

    uint32_t st3;
    uint16_t st1, maxStage1, st2;

    UChar32 c;

    /* enumerate the from-Unicode trie table */
    mbcsTable=&sharedData->mbcs;
    table=mbcsTable->fromUnicodeTable;
    /*
     * Each stage 1 entry covers 1024 code points (see c+=1024 below):
     * 0x440 entries reach U+10FFFF, 0x40 entries cover the BMP only.
     */
    if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
        maxStage1=0x440;
    } else {
        maxStage1=0x40;
    }

    c=0; /* keep track of the current code point while enumerating */

    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
        /* single-byte table: stage 2 and stage 3 entries are 16-bit units */
        const uint16_t *stage2, *stage3, *results;
        uint16_t minValue;

        results=(const uint16_t *)mbcsTable->fromUnicodeBytes;

        /*
         * Set a threshold variable for selecting which mappings to use.
         * See ucnv_MBCSSingleFromBMPWithOffsets() and
         * MBCS_SINGLE_RESULT_FROM_U() for details.
         */
        if(which==UCNV_ROUNDTRIP_SET) {
            /* use only roundtrips */
            minValue=0xf00;
        } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
            /* use all roundtrip and fallback results */
            minValue=0x800;
        }

        for(st1=0; st1<maxStage1; ++st1) {
            st2=table[st1];
            /* stage 1 values not greater than maxStage1 are treated as empty stage 2 blocks */
            if(st2>maxStage1) {
                stage2=table+st2;
                /* st2 is reused as the index into the 64-entry stage 2 block */
                for(st2=0; st2<64; ++st2) {
                    if((st3=stage2[st2])!=0) {
                        /* read the stage 3 block */
                        stage3=results+st3;

                        /* c is a multiple of 16 here, so this loop visits exactly 16 entries */
                        do {
                            if(*stage3++>=minValue) {
                                sa->add(sa->set, c);
                            }
                        } while((++c&0xf)!=0);
                    } else {
                        c+=16; /* empty stage 3 block */
                    }
                }
            } else {
                c+=1024; /* empty stage 2 block */
            }
        }
    } else {
        /* multi-byte table: stage 2 entries are 32-bit units, stage 3 is a byte array */
        const uint32_t *stage2;
        const uint8_t *stage3, *bytes;
        uint32_t st3Multiplier;
        uint32_t value;
        UBool useFallback;

        bytes=mbcsTable->fromUnicodeBytes;

        useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);

        /* st3Multiplier is the number of bytes per stage 3 entry */
        switch(mbcsTable->outputType) {
        case MBCS_OUTPUT_3:
        case MBCS_OUTPUT_4_EUC:
            st3Multiplier=3;
            break;
        case MBCS_OUTPUT_4:
            st3Multiplier=4;
            break;
        default:
            st3Multiplier=2;
            break;
        }

        for(st1=0; st1<maxStage1; ++st1) {
            st2=table[st1];
            /*
             * The stage 1 value indexes 32-bit units counted from the start of table,
             * hence the threshold is half of maxStage1 here.
             */
            if(st2>(maxStage1>>1)) {
                stage2=(const uint32_t *)table+st2;
                for(st2=0; st2<64; ++st2) {
                    if((st3=stage2[st2])!=0) {
                        /* read the stage 3 block */
                        /* the lower 16 bits of the stage 2 entry are the stage 3 block index */
                        stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;

                        /* get the roundtrip flags for the stage 3 block */
                        /* from here on, bit 0 of st3 is the flag for the current code point */
                        st3>>=16;

                        /*
                         * Add code points for which the roundtrip flag is set,
                         * or which map to non-zero bytes if we use fallbacks.
                         * See ucnv_MBCSFromUnicodeWithOffsets() for details.
                         */
                        switch(filter) {
                        case UCNV_SET_FILTER_NONE:
                            /*
                             * stage3 is advanced only on the two branches below;
                             * when useFallback is FALSE it is never dereferenced in this loop.
                             */
                            do {
                                if(st3&1) {
                                    sa->add(sa->set, c);
                                    stage3+=st3Multiplier;
                                } else if(useFallback) {
                                    /* OR together all bytes of the entry to test for a non-zero result */
                                    uint8_t b=0;
                                    switch(st3Multiplier) {
                                    case 4:
                                        b|=*stage3++;
                                        /* fall through */
                                    case 3:
                                        b|=*stage3++;
                                        /* fall through */
                                    case 2:
                                        b|=stage3[0]|stage3[1];
                                        stage3+=2;
                                        /* fall through */
                                    default:
                                        break;
                                    }
                                    if(b!=0) {
                                        sa->add(sa->set, c);
                                    }
                                }
                                st3>>=1;
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_DBCS_ONLY:
                             /* Ignore single-byte results (<0x100). */
                            /* this case reads 16-bit entries and steps by 2 bytes regardless of st3Multiplier */
                            do {
                                if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_2022_CN:
                             /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
                            /* this case tests the first byte of each entry and steps by 3 bytes regardless of st3Multiplier */
                            do {
                                if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=3;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_SJIS:
                             /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
                            do {
                                if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_GR94DBCS:
                            /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
                            /* the uint16_t test bounds the whole value, the uint8_t test bounds its low byte */
                            do {
                                if( ((st3&1)!=0 || useFallback) &&
                                    (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
                                ) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_HZ:
                            /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
                            /* same two-step range test as for GR94DBCS, with the upper bound 0xfdfe */
                            do {
                                if( ((st3&1)!=0 || useFallback) &&
                                    (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
                                ) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        default:
                            /* unknown filter: report an error and skip the extension enumeration below */
                            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                            return;
                        }
                    } else {
                        c+=16; /* empty stage 3 block */
                    }
                }
            } else {
                c+=1024; /* empty stage 2 block */
            }
        }
    }

    /* let the extension data contribute its code points as well */
    ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
}
913
914U_CFUNC void
915ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
916                                 const USetAdder *sa,
917                                 UConverterUnicodeSet which,
918                                 UErrorCode *pErrorCode) {
919    ucnv_MBCSGetFilteredUnicodeSetForUnicode(
920        sharedData, sa, which,
921        sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
922            UCNV_SET_FILTER_DBCS_ONLY :
923            UCNV_SET_FILTER_NONE,
924        pErrorCode);
925}
926
927static void
928ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
929                   const USetAdder *sa,
930                   UConverterUnicodeSet which,
931                   UErrorCode *pErrorCode) {
932    if(cnv->options&_MBCS_OPTION_GB18030) {
933        sa->addRange(sa->set, 0, 0xd7ff);
934        sa->addRange(sa->set, 0xe000, 0x10ffff);
935    } else {
936        ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
937    }
938}
939
940/* conversion extensions for input not in the main table -------------------- */
941
942/*
943 * Hardcoded extension handling for GB 18030.
 * For the definitions of the LINEAR macros and gb18030Ranges, see near the beginning of the file.
945 *
946 * In the future, conversion extensions may handle m:n mappings and delta tables,
947 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
948 *
949 * If an input character cannot be mapped, then these functions set an error
950 * code. The framework will then call the callback function.
951 */
952
953/*
954 * @return if(U_FAILURE) return the code point for cnv->fromUChar32
955 *         else return 0 after output has been written to the target
956 */
957static UChar32
958_extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
959          UChar32 cp,
960          const UChar **source, const UChar *sourceLimit,
961          uint8_t **target, const uint8_t *targetLimit,
962          int32_t **offsets, int32_t sourceIndex,
963          UBool flush,
964          UErrorCode *pErrorCode) {
965    const int32_t *cx;
966
967    cnv->useSubChar1=FALSE;
968
969    if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
970        ucnv_extInitialMatchFromU(
971            cnv, cx,
972            cp, source, sourceLimit,
973            (char **)target, (char *)targetLimit,
974            offsets, sourceIndex,
975            flush,
976            pErrorCode)
977    ) {
978        return 0; /* an extension mapping handled the input */
979    }
980
981    /* GB 18030 */
982    if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
983        const uint32_t *range;
984        int32_t i;
985
986        range=gb18030Ranges[0];
987        for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
988            if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
989                /* found the Unicode code point, output the four-byte sequence for it */
990                uint32_t linear;
991                char bytes[4];
992
993                /* get the linear value of the first GB 18030 code in this range */
994                linear=range[2]-LINEAR_18030_BASE;
995
996                /* add the offset from the beginning of the range */
997                linear+=((uint32_t)cp-range[0]);
998
999                /* turn this into a four-byte sequence */
1000                bytes[3]=(char)(0x30+linear%10); linear/=10;
1001                bytes[2]=(char)(0x81+linear%126); linear/=126;
1002                bytes[1]=(char)(0x30+linear%10); linear/=10;
1003                bytes[0]=(char)(0x81+linear);
1004
1005                /* output this sequence */
1006                ucnv_fromUWriteBytes(cnv,
1007                                     bytes, 4, (char **)target, (char *)targetLimit,
1008                                     offsets, sourceIndex, pErrorCode);
1009                return 0;
1010            }
1011        }
1012    }
1013
1014    /* no mapping */
1015    *pErrorCode=U_INVALID_CHAR_FOUND;
1016    return cp;
1017}
1018
1019/*
1020 * Input sequence: cnv->toUBytes[0..length[
1021 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
1022 *         else return 0 after output has been written to the target
1023 */
1024static int8_t
1025_extToU(UConverter *cnv, const UConverterSharedData *sharedData,
1026        int8_t length,
1027        const uint8_t **source, const uint8_t *sourceLimit,
1028        UChar **target, const UChar *targetLimit,
1029        int32_t **offsets, int32_t sourceIndex,
1030        UBool flush,
1031        UErrorCode *pErrorCode) {
1032    const int32_t *cx;
1033
1034    if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
1035        ucnv_extInitialMatchToU(
1036            cnv, cx,
1037            length, (const char **)source, (const char *)sourceLimit,
1038            target, targetLimit,
1039            offsets, sourceIndex,
1040            flush,
1041            pErrorCode)
1042    ) {
1043        return 0; /* an extension mapping handled the input */
1044    }
1045
1046    /* GB 18030 */
1047    if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
1048        const uint32_t *range;
1049        uint32_t linear;
1050        int32_t i;
1051
1052        linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
1053        range=gb18030Ranges[0];
1054        for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
1055            if(range[2]<=linear && linear<=range[3]) {
1056                /* found the sequence, output the Unicode code point for it */
1057                *pErrorCode=U_ZERO_ERROR;
1058
1059                /* add the linear difference between the input and start sequences to the start code point */
1060                linear=range[0]+(linear-range[2]);
1061
1062                /* output this code point */
1063                ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
1064
1065                return 0;
1066            }
1067        }
1068    }
1069
1070    /* no mapping */
1071    *pErrorCode=U_INVALID_CHAR_FOUND;
1072    return length;
1073}
1074
1075/* EBCDIC swap LF<->NL ------------------------------------------------------ */
1076
1077/*
1078 * This code modifies a standard EBCDIC<->Unicode mapping table for
1079 * OS/390 (z/OS) Unix System Services (Open Edition).
1080 * The difference is in the mapping of Line Feed and New Line control codes:
1081 * Standard EBCDIC maps
1082 *
1083 *   <U000A> \x25 |0
1084 *   <U0085> \x15 |0
1085 *
1086 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
1087 * mapping
1088 *
1089 *   <U000A> \x15 |0
1090 *   <U0085> \x25 |0
1091 *
1092 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
1093 * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support the same EBCDIC charset in both versions
 * without duplicating the entire installed table.
1096 */
1097
1098/* standard EBCDIC codes */
1099#define EBCDIC_LF 0x25
1100#define EBCDIC_NL 0x15
1101
1102/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
1103#define EBCDIC_RT_LF 0xf25
1104#define EBCDIC_RT_NL 0xf15
1105
1106/* Unicode code points */
1107#define U_LF 0x0a
1108#define U_NL 0x85
1109
1110static UBool
1111_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
1112    UConverterMBCSTable *mbcsTable;
1113
1114    const uint16_t *table, *results;
1115    const uint8_t *bytes;
1116
1117    int32_t (*newStateTable)[256];
1118    uint16_t *newResults;
1119    uint8_t *p;
1120    char *name;
1121
1122    uint32_t stage2Entry;
1123    uint32_t size, sizeofFromUBytes;
1124
1125    mbcsTable=&sharedData->mbcs;
1126
1127    table=mbcsTable->fromUnicodeTable;
1128    bytes=mbcsTable->fromUnicodeBytes;
1129    results=(const uint16_t *)bytes;
1130
1131    /*
1132     * Check that this is an EBCDIC table with SBCS portion -
1133     * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
1134     *
1135     * If not, ignore the option. Options are always ignored if they do not apply.
1136     */
1137    if(!(
1138         (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
1139         mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
1140         mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
1141    )) {
1142        return FALSE;
1143    }
1144
1145    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1146        if(!(
1147             EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
1148             EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
1149        )) {
1150            return FALSE;
1151        }
1152    } else /* MBCS_OUTPUT_2_SISO */ {
1153        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1154        if(!(
1155             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
1156             EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
1157        )) {
1158            return FALSE;
1159        }
1160
1161        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1162        if(!(
1163             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
1164             EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
1165        )) {
1166            return FALSE;
1167        }
1168    }
1169
1170    if(mbcsTable->fromUBytesLength>0) {
1171        /*
1172         * We _know_ the number of bytes in the fromUnicodeBytes array
1173         * starting with header.version 4.1.
1174         */
1175        sizeofFromUBytes=mbcsTable->fromUBytesLength;
1176    } else {
1177        /*
1178         * Otherwise:
1179         * There used to be code to enumerate the fromUnicode
1180         * trie and find the highest entry, but it was removed in ICU 3.2
1181         * because it was not tested and caused a low code coverage number.
1182         * See Jitterbug 3674.
1183         * This affects only some .cnv file formats with a header.version
1184         * below 4.1, and only when swaplfnl is requested.
1185         *
1186         * ucnvmbcs.c revision 1.99 is the last one with the
1187         * ucnv_MBCSSizeofFromUBytes() function.
1188         */
1189        *pErrorCode=U_INVALID_FORMAT_ERROR;
1190        return FALSE;
1191    }
1192
1193    /*
1194     * The table has an appropriate format.
1195     * Allocate and build
1196     * - a modified to-Unicode state table
1197     * - a modified from-Unicode output array
1198     * - a converter name string with the swap option appended
1199     */
1200    size=
1201        mbcsTable->countStates*1024+
1202        sizeofFromUBytes+
1203        UCNV_MAX_CONVERTER_NAME_LENGTH+20;
1204    p=(uint8_t *)uprv_malloc(size);
1205    if(p==NULL) {
1206        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1207        return FALSE;
1208    }
1209
1210    /* copy and modify the to-Unicode state table */
1211    newStateTable=(int32_t (*)[256])p;
1212    uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
1213
1214    newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
1215    newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
1216
1217    /* copy and modify the from-Unicode result table */
1218    newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
1219    uprv_memcpy(newResults, bytes, sizeofFromUBytes);
1220
1221    /* conveniently, the table access macros work on the left side of expressions */
1222    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1223        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
1224        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
1225    } else /* MBCS_OUTPUT_2_SISO */ {
1226        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1227        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
1228
1229        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1230        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
1231    }
1232
1233    /* set the canonical converter name */
1234    name=(char *)newResults+sizeofFromUBytes;
1235    uprv_strcpy(name, sharedData->staticData->name);
1236    uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
1237
1238    /* set the pointers */
1239    umtx_lock(NULL);
1240    if(mbcsTable->swapLFNLStateTable==NULL) {
1241        mbcsTable->swapLFNLStateTable=newStateTable;
1242        mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
1243        mbcsTable->swapLFNLName=name;
1244
1245        newStateTable=NULL;
1246    }
1247    umtx_unlock(NULL);
1248
1249    /* release the allocated memory if another thread beat us to it */
1250    if(newStateTable!=NULL) {
1251        uprv_free(newStateTable);
1252    }
1253    return TRUE;
1254}
1255
1256/* reconstitute omitted fromUnicode data ------------------------------------ */
1257
1258/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
1259static UBool U_CALLCONV
1260writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
1261    UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
1262    const uint16_t *table;
1263    uint32_t *stage2;
1264    uint8_t *bytes, *p;
1265    UChar32 c;
1266    int32_t i, st3;
1267
1268    table=mbcsTable->fromUnicodeTable;
1269    bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
1270
1271    /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
1272    switch(mbcsTable->outputType) {
1273    case MBCS_OUTPUT_3_EUC:
1274        if(value<=0xffff) {
1275            /* short sequences are stored directly */
1276            /* code set 0 or 1 */
1277        } else if(value<=0x8effff) {
1278            /* code set 2 */
1279            value&=0x7fff;
1280        } else /* first byte is 0x8f */ {
1281            /* code set 3 */
1282            value&=0xff7f;
1283        }
1284        break;
1285    case MBCS_OUTPUT_4_EUC:
1286        if(value<=0xffffff) {
1287            /* short sequences are stored directly */
1288            /* code set 0 or 1 */
1289        } else if(value<=0x8effffff) {
1290            /* code set 2 */
1291            value&=0x7fffff;
1292        } else /* first byte is 0x8f */ {
1293            /* code set 3 */
1294            value&=0xff7fff;
1295        }
1296        break;
1297    default:
1298        break;
1299    }
1300
1301    for(i=0; i<=0x1f; ++value, ++i) {
1302        c=codePoints[i];
1303        if(c<0) {
1304            continue;
1305        }
1306
1307        /* locate the stage 2 & 3 data */
1308        stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
1309        p=bytes;
1310        st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
1311
1312        /* write the codepage bytes into stage 3 */
1313        switch(mbcsTable->outputType) {
1314        case MBCS_OUTPUT_3:
1315        case MBCS_OUTPUT_4_EUC:
1316            p+=st3*3;
1317            p[0]=(uint8_t)(value>>16);
1318            p[1]=(uint8_t)(value>>8);
1319            p[2]=(uint8_t)value;
1320            break;
1321        case MBCS_OUTPUT_4:
1322            ((uint32_t *)p)[st3]=value;
1323            break;
1324        default:
1325            /* 2 bytes per character */
1326            ((uint16_t *)p)[st3]=(uint16_t)value;
1327            break;
1328        }
1329
1330        /* set the roundtrip flag */
1331        *stage2|=(1UL<<(16+(c&0xf)));
1332    }
1333    return TRUE;
1334 }
1335
1336static void
1337reconstituteData(UConverterMBCSTable *mbcsTable,
1338                 uint32_t stage1Length, uint32_t stage2Length,
1339                 uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
1340                 UErrorCode *pErrorCode) {
1341    uint16_t *stage1;
1342    uint32_t *stage2;
1343    uint8_t *bytes;
1344    uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
1345    mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
1346    if(mbcsTable->reconstitutedData==NULL) {
1347        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1348        return;
1349    }
1350    uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
1351
1352    /* copy existing data and reroute the pointers */
1353    stage1=(uint16_t *)mbcsTable->reconstitutedData;
1354    uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
1355
1356    stage2=(uint32_t *)(stage1+stage1Length);
1357    uprv_memcpy(stage2+(fullStage2Length-stage2Length),
1358                mbcsTable->fromUnicodeTable+stage1Length,
1359                stage2Length*4);
1360
1361    mbcsTable->fromUnicodeTable=stage1;
1362    mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);
1363
1364    /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
1365    stage2=(uint32_t *)stage1;
1366
1367    /* reconstitute the initial part of stage 2 from the mbcsIndex */
1368    {
1369        int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
1370        int32_t stageUTF8Index=0;
1371        int32_t st1, st2, st3, i;
1372
1373        for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
1374            st2=stage1[st1];
1375            if(st2!=stage1Length/2) {
1376                /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
1377                for(i=0; i<16; ++i) {
1378                    st3=mbcsTable->mbcsIndex[stageUTF8Index++];
1379                    if(st3!=0) {
1380                        /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
1381                        st3>>=4;
1382                        /*
1383                         * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
1384                         * allocated together as a single 64-block for access from the mbcsIndex
1385                         */
1386                        stage2[st2++]=st3++;
1387                        stage2[st2++]=st3++;
1388                        stage2[st2++]=st3++;
1389                        stage2[st2++]=st3;
1390                    } else {
1391                        /* no stage 3 block, skip */
1392                        st2+=4;
1393                    }
1394                }
1395            } else {
1396                /* no stage 2 block, skip */
1397                stageUTF8Index+=16;
1398            }
1399        }
1400    }
1401
1402    /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
1403    ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
1404}
1405
1406/* MBCS setup functions ----------------------------------------------------- */
1407
1408static void
1409ucnv_MBCSLoad(UConverterSharedData *sharedData,
1410          UConverterLoadArgs *pArgs,
1411          const uint8_t *raw,
1412          UErrorCode *pErrorCode) {
1413    UDataInfo info;
1414    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1415    _MBCSHeader *header=(_MBCSHeader *)raw;
1416    uint32_t offset;
1417    uint32_t headerLength;
1418    UBool noFromU=FALSE;
1419
1420    if(header->version[0]==4) {
1421        headerLength=MBCS_HEADER_V4_LENGTH;
1422    } else if(header->version[0]==5 && header->version[1]>=3 &&
1423              (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
1424        headerLength=header->options&MBCS_OPT_LENGTH_MASK;
1425        noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
1426    } else {
1427        *pErrorCode=U_INVALID_TABLE_FORMAT;
1428        return;
1429    }
1430
1431    mbcsTable->outputType=(uint8_t)header->flags;
1432    if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
1433        *pErrorCode=U_INVALID_TABLE_FORMAT;
1434        return;
1435    }
1436
1437    /* extension data, header version 4.2 and higher */
1438    offset=header->flags>>8;
1439    if(offset!=0) {
1440        mbcsTable->extIndexes=(const int32_t *)(raw+offset);
1441    }
1442
1443    if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
1444        UConverterLoadArgs args={ 0 };
1445        UConverterSharedData *baseSharedData;
1446        const int32_t *extIndexes;
1447        const char *baseName;
1448
1449        /* extension-only file, load the base table and set values appropriately */
1450        if((extIndexes=mbcsTable->extIndexes)==NULL) {
1451            /* extension-only file without extension */
1452            *pErrorCode=U_INVALID_TABLE_FORMAT;
1453            return;
1454        }
1455
1456        if(pArgs->nestedLoads!=1) {
1457            /* an extension table must not be loaded as a base table */
1458            *pErrorCode=U_INVALID_TABLE_FILE;
1459            return;
1460        }
1461
1462        /* load the base table */
1463        baseName=(const char *)header+headerLength*4;
1464        if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
1465            /* forbid loading this same extension-only file */
1466            *pErrorCode=U_INVALID_TABLE_FORMAT;
1467            return;
1468        }
1469
1470        /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
1471        args.size=sizeof(UConverterLoadArgs);
1472        args.nestedLoads=2;
1473        args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
1474        args.reserved=pArgs->reserved;
1475        args.options=pArgs->options;
1476        args.pkg=pArgs->pkg;
1477        args.name=baseName;
1478        baseSharedData=ucnv_load(&args, pErrorCode);
1479        if(U_FAILURE(*pErrorCode)) {
1480            return;
1481        }
1482        if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
1483            baseSharedData->mbcs.baseSharedData!=NULL
1484        ) {
1485            ucnv_unload(baseSharedData);
1486            *pErrorCode=U_INVALID_TABLE_FORMAT;
1487            return;
1488        }
1489        if(pArgs->onlyTestIsLoadable) {
1490            /*
1491             * Exit as soon as we know that we can load the converter
1492             * and the format is valid and supported.
1493             * The worst that can happen in the following code is a memory
1494             * allocation error.
1495             */
1496            ucnv_unload(baseSharedData);
1497            return;
1498        }
1499
1500        /* copy the base table data */
1501        uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
1502
1503        /* overwrite values with relevant ones for the extension converter */
1504        mbcsTable->baseSharedData=baseSharedData;
1505        mbcsTable->extIndexes=extIndexes;
1506
1507        /*
1508         * It would be possible to share the swapLFNL data with a base converter,
1509         * but the generated name would have to be different, and the memory
1510         * would have to be free'd only once.
1511         * It is easier to just create the data for the extension converter
1512         * separately when it is requested.
1513         */
1514        mbcsTable->swapLFNLStateTable=NULL;
1515        mbcsTable->swapLFNLFromUnicodeBytes=NULL;
1516        mbcsTable->swapLFNLName=NULL;
1517
1518        /*
1519         * The reconstitutedData must be deleted only when the base converter
1520         * is unloaded.
1521         */
1522        mbcsTable->reconstitutedData=NULL;
1523
1524        /*
1525         * Set a special, runtime-only outputType if the extension converter
1526         * is a DBCS version of a base converter that also maps single bytes.
1527         */
1528        if( sharedData->staticData->conversionType==UCNV_DBCS ||
1529                (sharedData->staticData->conversionType==UCNV_MBCS &&
1530                 sharedData->staticData->minBytesPerChar>=2)
1531        ) {
1532            if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
1533                /* the base converter is SI/SO-stateful */
1534                int32_t entry;
1535
1536                /* get the dbcs state from the state table entry for SO=0x0e */
1537                entry=mbcsTable->stateTable[0][0xe];
1538                if( MBCS_ENTRY_IS_FINAL(entry) &&
1539                    MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
1540                    MBCS_ENTRY_FINAL_STATE(entry)!=0
1541                ) {
1542                    mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1543
1544                    mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1545                }
1546            } else if(
1547                baseSharedData->staticData->conversionType==UCNV_MBCS &&
1548                baseSharedData->staticData->minBytesPerChar==1 &&
1549                baseSharedData->staticData->maxBytesPerChar==2 &&
1550                mbcsTable->countStates<=127
1551            ) {
1552                /* non-stateful base converter, need to modify the state table */
1553                int32_t (*newStateTable)[256];
1554                int32_t *state;
1555                int32_t i, count;
1556
1557                /* allocate a new state table and copy the base state table contents */
1558                count=mbcsTable->countStates;
1559                newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
1560                if(newStateTable==NULL) {
1561                    ucnv_unload(baseSharedData);
1562                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1563                    return;
1564                }
1565
1566                uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
1567
1568                /* change all final single-byte entries to go to a new all-illegal state */
1569                state=newStateTable[0];
1570                for(i=0; i<256; ++i) {
1571                    if(MBCS_ENTRY_IS_FINAL(state[i])) {
1572                        state[i]=MBCS_ENTRY_TRANSITION(count, 0);
1573                    }
1574                }
1575
1576                /* build the new all-illegal state */
1577                state=newStateTable[count];
1578                for(i=0; i<256; ++i) {
1579                    state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
1580                }
1581                mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
1582                mbcsTable->countStates=(uint8_t)(count+1);
1583                mbcsTable->stateTableOwned=TRUE;
1584
1585                mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1586            }
1587        }
1588
1589        /*
1590         * unlike below for files with base tables, do not get the unicodeMask
1591         * from the sharedData; instead, use the base table's unicodeMask,
1592         * which we copied in the memcpy above;
1593         * this is necessary because the static data unicodeMask, especially
1594         * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1595         */
1596    } else {
1597        /* conversion file with a base table; an additional extension table is optional */
1598        /* make sure that the output type is known */
1599        switch(mbcsTable->outputType) {
1600        case MBCS_OUTPUT_1:
1601        case MBCS_OUTPUT_2:
1602        case MBCS_OUTPUT_3:
1603        case MBCS_OUTPUT_4:
1604        case MBCS_OUTPUT_3_EUC:
1605        case MBCS_OUTPUT_4_EUC:
1606        case MBCS_OUTPUT_2_SISO:
1607            /* OK */
1608            break;
1609        default:
1610            *pErrorCode=U_INVALID_TABLE_FORMAT;
1611            return;
1612        }
1613        if(pArgs->onlyTestIsLoadable) {
1614            /*
1615             * Exit as soon as we know that we can load the converter
1616             * and the format is valid and supported.
1617             * The worst that can happen in the following code is a memory
1618             * allocation error.
1619             */
1620            return;
1621        }
1622
1623        mbcsTable->countStates=(uint8_t)header->countStates;
1624        mbcsTable->countToUFallbacks=header->countToUFallbacks;
1625        mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
1626        mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
1627        mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
1628
1629        mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
1630        mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
1631        mbcsTable->fromUBytesLength=header->fromUBytesLength;
1632
1633        /*
1634         * converter versions 6.1 and up contain a unicodeMask that is
1635         * used here to select the most efficient function implementations
1636         */
1637        info.size=sizeof(UDataInfo);
1638        udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
1639        if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
1640            /* mask off possible future extensions to be safe */
1641            mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
1642        } else {
1643            /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1644            mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
1645        }
1646
1647        /*
1648         * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1649         * Check for the header version, SBCS vs. MBCS, and for whether the
1650         * data structures are optimized for code points as high as what the
1651         * runtime code is designed for.
1652         * The implementation does not handle mapping tables with entries for
1653         * unpaired surrogates.
1654         */
1655        if( header->version[1]>=3 &&
1656            (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
1657            (mbcsTable->countStates==1 ?
1658                (header->version[2]>=(SBCS_FAST_MAX>>8)) :
1659                (header->version[2]>=(MBCS_FAST_MAX>>8))
1660            )
1661        ) {
1662            mbcsTable->utf8Friendly=TRUE;
1663
1664            if(mbcsTable->countStates==1) {
1665                /*
1666                 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1667                 * Build a table with indexes to each block, to be used instead of
1668                 * the regular stage 1/2 table.
1669                 */
1670                int32_t i;
1671                for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
1672                    mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
1673                }
1674                /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
1675                mbcsTable->maxFastUChar=SBCS_FAST_MAX;
1676            } else {
1677                /*
1678                 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1679                 * The .cnv file is prebuilt with an additional stage table with indexes
1680                 * to each block.
1681                 */
1682                mbcsTable->mbcsIndex=(const uint16_t *)
1683                    (mbcsTable->fromUnicodeBytes+
1684                     (noFromU ? 0 : mbcsTable->fromUBytesLength));
1685                mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
1686            }
1687        }
1688
1689        /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
1690        {
1691            uint32_t asciiRoundtrips=0xffffffff;
1692            int32_t i;
1693
1694            for(i=0; i<0x80; ++i) {
1695                if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
1696                    asciiRoundtrips&=~((uint32_t)1<<(i>>2));
1697                }
1698            }
1699            mbcsTable->asciiRoundtrips=asciiRoundtrips;
1700        }
1701
1702        if(noFromU) {
1703            uint32_t stage1Length=
1704                mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
1705                    0x440 : 0x40;
1706            uint32_t stage2Length=
1707                (header->offsetFromUBytes-header->offsetFromUTable)/4-
1708                stage1Length/2;
1709            reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
1710        }
1711    }
1712
1713    /* Set the impl pointer here so that it is set for both extension-only and base tables. */
1714    if(mbcsTable->utf8Friendly) {
1715        if(mbcsTable->countStates==1) {
1716            sharedData->impl=&_SBCSUTF8Impl;
1717        } else {
1718            if(mbcsTable->outputType==MBCS_OUTPUT_2) {
1719                sharedData->impl=&_DBCSUTF8Impl;
1720            }
1721        }
1722    }
1723
1724    if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
1725        /*
1726         * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1727         * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1728         */
1729        mbcsTable->asciiRoundtrips=0;
1730    }
1731}
1732
1733static void
1734ucnv_MBCSUnload(UConverterSharedData *sharedData) {
1735    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1736
1737    if(mbcsTable->swapLFNLStateTable!=NULL) {
1738        uprv_free(mbcsTable->swapLFNLStateTable);
1739    }
1740    if(mbcsTable->stateTableOwned) {
1741        uprv_free((void *)mbcsTable->stateTable);
1742    }
1743    if(mbcsTable->baseSharedData!=NULL) {
1744        ucnv_unload(mbcsTable->baseSharedData);
1745    }
1746    if(mbcsTable->reconstitutedData!=NULL) {
1747        uprv_free(mbcsTable->reconstitutedData);
1748    }
1749}
1750
1751static void
1752ucnv_MBCSOpen(UConverter *cnv,
1753              UConverterLoadArgs *pArgs,
1754              UErrorCode *pErrorCode) {
1755    UConverterMBCSTable *mbcsTable;
1756    const int32_t *extIndexes;
1757    uint8_t outputType;
1758    int8_t maxBytesPerUChar;
1759
1760    if(pArgs->onlyTestIsLoadable) {
1761        return;
1762    }
1763
1764    mbcsTable=&cnv->sharedData->mbcs;
1765    outputType=mbcsTable->outputType;
1766
1767    if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
1768        /* the swaplfnl option does not apply, remove it */
1769        cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
1770    }
1771
1772    if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1773        /* do this because double-checked locking is broken */
1774        UBool isCached;
1775
1776        umtx_lock(NULL);
1777        isCached=mbcsTable->swapLFNLStateTable!=NULL;
1778        umtx_unlock(NULL);
1779
1780        if(!isCached) {
1781            if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
1782                if(U_FAILURE(*pErrorCode)) {
1783                    return; /* something went wrong */
1784                }
1785
1786                /* the option does not apply, remove it */
1787                cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
1788            }
1789        }
1790    }
1791
1792    if(uprv_strstr(pArgs->name, "18030")!=NULL) {
1793        if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
1794            /* set a flag for GB 18030 mode, which changes the callback behavior */
1795            cnv->options|=_MBCS_OPTION_GB18030;
1796        }
1797    } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
1798        /* set a flag for KEIS converter, which changes the SI/SO character sequence */
1799        cnv->options|=_MBCS_OPTION_KEIS;
1800    } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
1801        /* set a flag for JEF converter, which changes the SI/SO character sequence */
1802        cnv->options|=_MBCS_OPTION_JEF;
1803    } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
1804        /* set a flag for JIPS converter, which changes the SI/SO character sequence */
1805        cnv->options|=_MBCS_OPTION_JIPS;
1806    }
1807
1808    /* fix maxBytesPerUChar depending on outputType and options etc. */
1809    if(outputType==MBCS_OUTPUT_2_SISO) {
1810        cnv->maxBytesPerUChar=3; /* SO+DBCS */
1811    }
1812
1813    extIndexes=mbcsTable->extIndexes;
1814    if(extIndexes!=NULL) {
1815        maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
1816        if(outputType==MBCS_OUTPUT_2_SISO) {
1817            ++maxBytesPerUChar; /* SO + multiple DBCS */
1818        }
1819
1820        if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
1821            cnv->maxBytesPerUChar=maxBytesPerUChar;
1822        }
1823    }
1824
1825#if 0
1826    /*
1827     * documentation of UConverter fields used for status
1828     * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1829     */
1830
1831    /* toUnicode */
1832    cnv->toUnicodeStatus=0;     /* offset */
1833    cnv->mode=0;                /* state */
1834    cnv->toULength=0;           /* byteIndex */
1835
1836    /* fromUnicode */
1837    cnv->fromUChar32=0;
1838    cnv->fromUnicodeStatus=1;   /* prevLength */
1839#endif
1840}
1841
1842static const char *
1843ucnv_MBCSGetName(const UConverter *cnv) {
1844    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
1845        return cnv->sharedData->mbcs.swapLFNLName;
1846    } else {
1847        return cnv->sharedData->staticData->name;
1848    }
1849}
1850
1851/* MBCS-to-Unicode conversion functions ------------------------------------- */
1852
1853static UChar32
1854ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
1855    const _MBCSToUFallback *toUFallbacks;
1856    uint32_t i, start, limit;
1857
1858    limit=mbcsTable->countToUFallbacks;
1859    if(limit>0) {
1860        /* do a binary search for the fallback mapping */
1861        toUFallbacks=mbcsTable->toUFallbacks;
1862        start=0;
1863        while(start<limit-1) {
1864            i=(start+limit)/2;
1865            if(offset<toUFallbacks[i].offset) {
1866                limit=i;
1867            } else {
1868                start=i;
1869            }
1870        }
1871
1872        /* did we really find it? */
1873        if(offset==toUFallbacks[start].offset) {
1874            return toUFallbacks[start].codePoint;
1875        }
1876    }
1877
1878    return 0xfffe;
1879}
1880
1881/* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
1882static void
1883ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1884                                UErrorCode *pErrorCode) {
1885    UConverter *cnv;
1886    const uint8_t *source, *sourceLimit;
1887    UChar *target;
1888    const UChar *targetLimit;
1889    int32_t *offsets;
1890
1891    const int32_t (*stateTable)[256];
1892
1893    int32_t sourceIndex;
1894
1895    int32_t entry;
1896    UChar c;
1897    uint8_t action;
1898
1899    /* set up the local pointers */
1900    cnv=pArgs->converter;
1901    source=(const uint8_t *)pArgs->source;
1902    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1903    target=pArgs->target;
1904    targetLimit=pArgs->targetLimit;
1905    offsets=pArgs->offsets;
1906
1907    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1908        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1909    } else {
1910        stateTable=cnv->sharedData->mbcs.stateTable;
1911    }
1912
1913    /* sourceIndex=-1 if the current character began in the previous buffer */
1914    sourceIndex=0;
1915
1916    /* conversion loop */
1917    while(source<sourceLimit) {
1918        /*
1919         * This following test is to see if available input would overflow the output.
1920         * It does not catch output of more than one code unit that
1921         * overflows as a result of a surrogate pair or callback output
1922         * from the last source byte.
1923         * Therefore, those situations also test for overflows and will
1924         * then break the loop, too.
1925         */
1926        if(target>=targetLimit) {
1927            /* target is full */
1928            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1929            break;
1930        }
1931
1932        entry=stateTable[0][*source++];
1933        /* MBCS_ENTRY_IS_FINAL(entry) */
1934
1935        /* test the most common case first */
1936        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1937            /* output BMP code point */
1938            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1939            if(offsets!=NULL) {
1940                *offsets++=sourceIndex;
1941            }
1942
1943            /* normal end of action codes: prepare for a new character */
1944            ++sourceIndex;
1945            continue;
1946        }
1947
1948        /*
1949         * An if-else-if chain provides more reliable performance for
1950         * the most common cases compared to a switch.
1951         */
1952        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1953        if(action==MBCS_STATE_VALID_DIRECT_20 ||
1954           (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
1955        ) {
1956            entry=MBCS_ENTRY_FINAL_VALUE(entry);
1957            /* output surrogate pair */
1958            *target++=(UChar)(0xd800|(UChar)(entry>>10));
1959            if(offsets!=NULL) {
1960                *offsets++=sourceIndex;
1961            }
1962            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1963            if(target<targetLimit) {
1964                *target++=c;
1965                if(offsets!=NULL) {
1966                    *offsets++=sourceIndex;
1967                }
1968            } else {
1969                /* target overflow */
1970                cnv->UCharErrorBuffer[0]=c;
1971                cnv->UCharErrorBufferLength=1;
1972                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1973                break;
1974            }
1975
1976            ++sourceIndex;
1977            continue;
1978        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1979            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
1980                /* output BMP code point */
1981                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1982                if(offsets!=NULL) {
1983                    *offsets++=sourceIndex;
1984                }
1985
1986                ++sourceIndex;
1987                continue;
1988            }
1989        } else if(action==MBCS_STATE_UNASSIGNED) {
1990            /* just fall through */
1991        } else if(action==MBCS_STATE_ILLEGAL) {
1992            /* callback(illegal) */
1993            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1994        } else {
1995            /* reserved, must never occur */
1996            ++sourceIndex;
1997            continue;
1998        }
1999
2000        if(U_FAILURE(*pErrorCode)) {
2001            /* callback(illegal) */
2002            break;
2003        } else /* unassigned sequences indicated with byteIndex>0 */ {
2004            /* try an extension mapping */
2005            pArgs->source=(const char *)source;
2006            cnv->toUBytes[0]=*(source-1);
2007            cnv->toULength=_extToU(cnv, cnv->sharedData,
2008                                    1, &source, sourceLimit,
2009                                    &target, targetLimit,
2010                                    &offsets, sourceIndex,
2011                                    pArgs->flush,
2012                                    pErrorCode);
2013            sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
2014
2015            if(U_FAILURE(*pErrorCode)) {
2016                /* not mappable or buffer overflow */
2017                break;
2018            }
2019        }
2020    }
2021
2022    /* write back the updated pointers */
2023    pArgs->source=(const char *)source;
2024    pArgs->target=target;
2025    pArgs->offsets=offsets;
2026}
2027
/*
 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
 * that only map to and from the BMP.
 * In addition to single-byte optimizations, the offset calculations
 * become much easier.
 *
 * Each source byte is looked up in row 0 of the state table and normally
 * yields exactly one UChar. Bytes whose entry is unassigned (or a fallback
 * that is not used) are passed to the extension table via _extToU();
 * illegal bytes set U_ILLEGAL_CHAR_FOUND.
 *
 * Offsets are not written per character in the main loop. Instead,
 * lastSource remembers the first source byte whose offset has not been
 * written yet, and the offsets are filled in as consecutive sourceIndex
 * values before an error/extension call and once more at the end.
 *
 * @param pArgs      conversion arguments; source, target and offsets are
 *                   written back on return
 * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR if the target is full while
 *                   input remains, to U_ILLEGAL_CHAR_FOUND for an illegal byte,
 *                   or to whatever _extToU() reports
 */
static void
ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit, *lastSource;
    UChar *target;
    int32_t targetCapacity, length;
    int32_t *offsets;

    const int32_t (*stateTable)[256];

    int32_t sourceIndex;

    int32_t entry;
    uint8_t action;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* with the swaplfnl option, convert with the alternate state table */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=0;
    /* no offsets have been written yet, so the pending range starts at source */
    lastSource=source;

    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length=(int32_t)(sourceLimit-source);
    if(length<targetCapacity) {
        targetCapacity=length;
    }

#if MBCS_UNROLL_SINGLE_TO_BMP
    /* unrolling makes it faster on Pentium III/Windows 2000 */
    /* unroll the loop with the most common case */
unrolled:
    if(targetCapacity>=16) {
        int32_t count, loops, oredEntries;

        /* number of complete 16-byte passes that fit into targetCapacity */
        loops=count=targetCapacity>>4;
        do {
            /*
             * Convert 16 bytes without testing each entry:
             * the values are written speculatively and the entries are
             * ORed together so that a single test after the 16th byte
             * decides whether the whole pass is kept.
             */
            oredEntries=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

            /* were all 16 entries really valid? */
            if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
                /* no, return to the first of these 16 */
                /* the regular loop below reprocesses them one at a time */
                source-=16;
                target-=16;
                break;
            }
        } while(--count>0);
        /*
         * count becomes the number of passes that were kept:
         * a rolled-back pass leaves the loop before --count is evaluated
         */
        count=loops-count;
        targetCapacity-=16*count;

        if(offsets!=NULL) {
            /* offsets for the kept passes are written right here, so advance lastSource past them */
            lastSource+=16*count;
            while(count>0) {
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                --count;
            }
        }
    }
#endif

    /* conversion loop */
    /*
     * The source<sourceLimit test is needed in addition to targetCapacity:
     * the reserved-action branch below consumes a byte without
     * decrementing targetCapacity.
     */
    while(targetCapacity > 0 && source < sourceLimit) {
        entry=stateTable[0][*source++];
        /* MBCS_ENTRY_IS_FINAL(entry) */

        /* test the most common case first */
        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
            /* output BMP code point */
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            --targetCapacity;
            continue;
        }

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                --targetCapacity;
                continue;
            }
            /* fallback not used: continue below like an unassigned byte */
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        } else {
            /* reserved, must never occur */
            continue;
        }

        /* set offsets since the start or the last extension */
        if(offsets!=NULL) {
            /* bytes consumed since lastSource, including the current one */
            int32_t count=(int32_t)(source-lastSource);

            /* predecrement: do not set the offset for the callback-causing character */
            while(--count>0) {
                *offsets++=sourceIndex++;
            }
            /* offset and sourceIndex are now set for the current character */
        }

        if(U_FAILURE(*pErrorCode)) {
            /* callback(illegal) */
            break;
        } else /* unassigned sequences indicated with byteIndex>0 */ {
            /* try an extension mapping */
            /* remember where source stood so that the bytes _extToU() consumes can be counted */
            lastSource=source;
            cnv->toUBytes[0]=*(source-1);
            cnv->toULength=_extToU(cnv, cnv->sharedData,
                                    1, &source, sourceLimit,
                                    &target, pArgs->targetLimit,
                                    &offsets, sourceIndex,
                                    pArgs->flush,
                                    pErrorCode);
            /* 1 for the current byte plus however far _extToU() advanced source */
            sourceIndex+=1+(int32_t)(source-lastSource);

            if(U_FAILURE(*pErrorCode)) {
                /* not mappable or buffer overflow */
                break;
            }

            /* recalculate the targetCapacity after an extension mapping */
            targetCapacity=(int32_t)(pArgs->targetLimit-target);
            length=(int32_t)(sourceLimit-source);
            if(length<targetCapacity) {
                targetCapacity=length;
            }
        }

#if MBCS_UNROLL_SINGLE_TO_BMP
        /* unrolling makes it faster on Pentium III/Windows 2000 */
        /* after an extension mapping, try the 16-at-a-time path again */
        goto unrolled;
#endif
    }

    /*
     * targetCapacity was capped at the remaining source length, so the loop
     * ending does not by itself tell whether the output is full; test the
     * real pointers to decide whether to report an overflow.
     */
    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* set offsets since the start or the last callback */
    if(offsets!=NULL) {
        size_t count=source-lastSource;
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}
2254
2255static UBool
2256hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
2257    const int32_t *row=stateTable[state];
2258    int32_t b, entry;
2259    /* First test for final entries in this state for some commonly valid byte values. */
2260    entry=row[0xa1];
2261    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2262        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2263    ) {
2264        return TRUE;
2265    }
2266    entry=row[0x41];
2267    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2268        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2269    ) {
2270        return TRUE;
2271    }
2272    /* Then test for final entries in this state. */
2273    for(b=0; b<=0xff; ++b) {
2274        entry=row[b];
2275        if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2276            MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2277        ) {
2278            return TRUE;
2279        }
2280    }
2281    /* Then recurse for transition entries. */
2282    for(b=0; b<=0xff; ++b) {
2283        entry=row[b];
2284        if( MBCS_ENTRY_IS_TRANSITION(entry) &&
2285            hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
2286        ) {
2287            return TRUE;
2288        }
2289    }
2290    return FALSE;
2291}
2292
2293/*
2294 * Is byte b a single/lead byte in this state?
2295 * Recurse for transition states, because here we don't want to say that
2296 * b is a lead byte if all byte sequences that start with b are illegal.
2297 */
2298static UBool
2299isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
2300    const int32_t *row=stateTable[state];
2301    int32_t entry=row[b];
2302    if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */
2303        return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
2304    } else {
2305        uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2306        if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
2307            return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
2308        } else {
2309            return action!=MBCS_STATE_ILLEGAL;
2310        }
2311    }
2312}
2313
2314U_CFUNC void
2315ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
2316                          UErrorCode *pErrorCode) {
2317    UConverter *cnv;
2318    const uint8_t *source, *sourceLimit;
2319    UChar *target;
2320    const UChar *targetLimit;
2321    int32_t *offsets;
2322
2323    const int32_t (*stateTable)[256];
2324    const uint16_t *unicodeCodeUnits;
2325
2326    uint32_t offset;
2327    uint8_t state;
2328    int8_t byteIndex;
2329    uint8_t *bytes;
2330
2331    int32_t sourceIndex, nextSourceIndex;
2332
2333    int32_t entry;
2334    UChar c;
2335    uint8_t action;
2336
2337    /* use optimized function if possible */
2338    cnv=pArgs->converter;
2339
2340    if(cnv->preToULength>0) {
2341        /*
2342         * pass sourceIndex=-1 because we continue from an earlier buffer
2343         * in the future, this may change with continuous offsets
2344         */
2345        ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
2346
2347        if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
2348            return;
2349        }
2350    }
2351
2352    if(cnv->sharedData->mbcs.countStates==1) {
2353        if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2354            ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
2355        } else {
2356            ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
2357        }
2358        return;
2359    }
2360
2361    /* set up the local pointers */
2362    source=(const uint8_t *)pArgs->source;
2363    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2364    target=pArgs->target;
2365    targetLimit=pArgs->targetLimit;
2366    offsets=pArgs->offsets;
2367
2368    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2369        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2370    } else {
2371        stateTable=cnv->sharedData->mbcs.stateTable;
2372    }
2373    unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2374
2375    /* get the converter state from UConverter */
2376    offset=cnv->toUnicodeStatus;
2377    byteIndex=cnv->toULength;
2378    bytes=cnv->toUBytes;
2379
2380    /*
2381     * if we are in the SBCS state for a DBCS-only converter,
2382     * then load the DBCS state from the MBCS data
2383     * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2384     */
2385    if((state=(uint8_t)(cnv->mode))==0) {
2386        state=cnv->sharedData->mbcs.dbcsOnlyState;
2387    }
2388
2389    /* sourceIndex=-1 if the current character began in the previous buffer */
2390    sourceIndex=byteIndex==0 ? 0 : -1;
2391    nextSourceIndex=0;
2392
2393    /* conversion loop */
2394    while(source<sourceLimit) {
2395        /*
2396         * This following test is to see if available input would overflow the output.
2397         * It does not catch output of more than one code unit that
2398         * overflows as a result of a surrogate pair or callback output
2399         * from the last source byte.
2400         * Therefore, those situations also test for overflows and will
2401         * then break the loop, too.
2402         */
2403        if(target>=targetLimit) {
2404            /* target is full */
2405            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2406            break;
2407        }
2408
2409        if(byteIndex==0) {
2410            /* optimized loop for 1/2-byte input and BMP output */
2411            if(offsets==NULL) {
2412                do {
2413                    entry=stateTable[state][*source];
2414                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2415                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2416                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2417
2418                        ++source;
2419                        if( source<sourceLimit &&
2420                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2421                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2422                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2423                        ) {
2424                            ++source;
2425                            *target++=c;
2426                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2427                            offset=0;
2428                        } else {
2429                            /* set the state and leave the optimized loop */
2430                            bytes[0]=*(source-1);
2431                            byteIndex=1;
2432                            break;
2433                        }
2434                    } else {
2435                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2436                            /* output BMP code point */
2437                            ++source;
2438                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2439                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2440                        } else {
2441                            /* leave the optimized loop */
2442                            break;
2443                        }
2444                    }
2445                } while(source<sourceLimit && target<targetLimit);
2446            } else /* offsets!=NULL */ {
2447                do {
2448                    entry=stateTable[state][*source];
2449                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2450                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2451                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2452
2453                        ++source;
2454                        if( source<sourceLimit &&
2455                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2456                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2457                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2458                        ) {
2459                            ++source;
2460                            *target++=c;
2461                            if(offsets!=NULL) {
2462                                *offsets++=sourceIndex;
2463                                sourceIndex=(nextSourceIndex+=2);
2464                            }
2465                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2466                            offset=0;
2467                        } else {
2468                            /* set the state and leave the optimized loop */
2469                            ++nextSourceIndex;
2470                            bytes[0]=*(source-1);
2471                            byteIndex=1;
2472                            break;
2473                        }
2474                    } else {
2475                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2476                            /* output BMP code point */
2477                            ++source;
2478                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2479                            if(offsets!=NULL) {
2480                                *offsets++=sourceIndex;
2481                                sourceIndex=++nextSourceIndex;
2482                            }
2483                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2484                        } else {
2485                            /* leave the optimized loop */
2486                            break;
2487                        }
2488                    }
2489                } while(source<sourceLimit && target<targetLimit);
2490            }
2491
2492            /*
2493             * these tests and break statements could be put inside the loop
2494             * if C had "break outerLoop" like Java
2495             */
2496            if(source>=sourceLimit) {
2497                break;
2498            }
2499            if(target>=targetLimit) {
2500                /* target is full */
2501                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2502                break;
2503            }
2504
2505            ++nextSourceIndex;
2506            bytes[byteIndex++]=*source++;
2507        } else /* byteIndex>0 */ {
2508            ++nextSourceIndex;
2509            entry=stateTable[state][bytes[byteIndex++]=*source++];
2510        }
2511
2512        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2513            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2514            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2515            continue;
2516        }
2517
2518        /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2519        cnv->mode=state;
2520
2521        /* set the next state early so that we can reuse the entry variable */
2522        state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2523
2524        /*
2525         * An if-else-if chain provides more reliable performance for
2526         * the most common cases compared to a switch.
2527         */
2528        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2529        if(action==MBCS_STATE_VALID_16) {
2530            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2531            c=unicodeCodeUnits[offset];
2532            if(c<0xfffe) {
2533                /* output BMP code point */
2534                *target++=c;
2535                if(offsets!=NULL) {
2536                    *offsets++=sourceIndex;
2537                }
2538                byteIndex=0;
2539            } else if(c==0xfffe) {
2540                if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2541                    /* output fallback BMP code point */
2542                    *target++=(UChar)entry;
2543                    if(offsets!=NULL) {
2544                        *offsets++=sourceIndex;
2545                    }
2546                    byteIndex=0;
2547                }
2548            } else {
2549                /* callback(illegal) */
2550                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2551            }
2552        } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2553            /* output BMP code point */
2554            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2555            if(offsets!=NULL) {
2556                *offsets++=sourceIndex;
2557            }
2558            byteIndex=0;
2559        } else if(action==MBCS_STATE_VALID_16_PAIR) {
2560            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2561            c=unicodeCodeUnits[offset++];
2562            if(c<0xd800) {
2563                /* output BMP code point below 0xd800 */
2564                *target++=c;
2565                if(offsets!=NULL) {
2566                    *offsets++=sourceIndex;
2567                }
2568                byteIndex=0;
2569            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2570                /* output roundtrip or fallback surrogate pair */
2571                *target++=(UChar)(c&0xdbff);
2572                if(offsets!=NULL) {
2573                    *offsets++=sourceIndex;
2574                }
2575                byteIndex=0;
2576                if(target<targetLimit) {
2577                    *target++=unicodeCodeUnits[offset];
2578                    if(offsets!=NULL) {
2579                        *offsets++=sourceIndex;
2580                    }
2581                } else {
2582                    /* target overflow */
2583                    cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
2584                    cnv->UCharErrorBufferLength=1;
2585                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2586
2587                    offset=0;
2588                    break;
2589                }
2590            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2591                /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2592                *target++=unicodeCodeUnits[offset];
2593                if(offsets!=NULL) {
2594                    *offsets++=sourceIndex;
2595                }
2596                byteIndex=0;
2597            } else if(c==0xffff) {
2598                /* callback(illegal) */
2599                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2600            }
2601        } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2602                  (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2603        ) {
2604            entry=MBCS_ENTRY_FINAL_VALUE(entry);
2605            /* output surrogate pair */
2606            *target++=(UChar)(0xd800|(UChar)(entry>>10));
2607            if(offsets!=NULL) {
2608                *offsets++=sourceIndex;
2609            }
2610            byteIndex=0;
2611            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
2612            if(target<targetLimit) {
2613                *target++=c;
2614                if(offsets!=NULL) {
2615                    *offsets++=sourceIndex;
2616                }
2617            } else {
2618                /* target overflow */
2619                cnv->UCharErrorBuffer[0]=c;
2620                cnv->UCharErrorBufferLength=1;
2621                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2622
2623                offset=0;
2624                break;
2625            }
2626        } else if(action==MBCS_STATE_CHANGE_ONLY) {
2627            /*
2628             * This serves as a state change without any output.
2629             * It is useful for reading simple stateful encodings,
2630             * for example using just Shift-In/Shift-Out codes.
2631             * The 21 unused bits may later be used for more sophisticated
2632             * state transitions.
2633             */
2634            if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
2635                byteIndex=0;
2636            } else {
2637                /* SI/SO are illegal for DBCS-only conversion */
2638                state=(uint8_t)(cnv->mode); /* restore the previous state */
2639
2640                /* callback(illegal) */
2641                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2642            }
2643        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2644            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2645                /* output BMP code point */
2646                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2647                if(offsets!=NULL) {
2648                    *offsets++=sourceIndex;
2649                }
2650                byteIndex=0;
2651            }
2652        } else if(action==MBCS_STATE_UNASSIGNED) {
2653            /* just fall through */
2654        } else if(action==MBCS_STATE_ILLEGAL) {
2655            /* callback(illegal) */
2656            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2657        } else {
2658            /* reserved, must never occur */
2659            byteIndex=0;
2660        }
2661
2662        /* end of action codes: prepare for a new character */
2663        offset=0;
2664
2665        if(byteIndex==0) {
2666            sourceIndex=nextSourceIndex;
2667        } else if(U_FAILURE(*pErrorCode)) {
2668            /* callback(illegal) */
2669            if(byteIndex>1) {
2670                /*
2671                 * Ticket 5691: consistent illegal sequences:
2672                 * - We include at least the first byte in the illegal sequence.
2673                 * - If any of the non-initial bytes could be the start of a character,
2674                 *   we stop the illegal sequence before the first one of those.
2675                 */
2676                UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
2677                int8_t i;
2678                for(i=1;
2679                    i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
2680                    ++i) {}
2681                if(i<byteIndex) {
2682                    /* Back out some bytes. */
2683                    int8_t backOutDistance=byteIndex-i;
2684                    int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
2685                    byteIndex=i;  /* length of reported illegal byte sequence */
2686                    if(backOutDistance<=bytesFromThisBuffer) {
2687                        source-=backOutDistance;
2688                    } else {
2689                        /* Back out bytes from the previous buffer: Need to replay them. */
2690                        cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
2691                        /* preToULength is negative! */
2692                        uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
2693                        source=(const uint8_t *)pArgs->source;
2694                    }
2695                }
2696            }
2697            break;
2698        } else /* unassigned sequences indicated with byteIndex>0 */ {
2699            /* try an extension mapping */
2700            pArgs->source=(const char *)source;
2701            byteIndex=_extToU(cnv, cnv->sharedData,
2702                              byteIndex, &source, sourceLimit,
2703                              &target, targetLimit,
2704                              &offsets, sourceIndex,
2705                              pArgs->flush,
2706                              pErrorCode);
2707            sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
2708
2709            if(U_FAILURE(*pErrorCode)) {
2710                /* not mappable or buffer overflow */
2711                break;
2712            }
2713        }
2714    }
2715
2716    /* set the converter state back into UConverter */
2717    cnv->toUnicodeStatus=offset;
2718    cnv->mode=state;
2719    cnv->toULength=byteIndex;
2720
2721    /* write back the updated pointers */
2722    pArgs->source=(const char *)source;
2723    pArgs->target=target;
2724    pArgs->offsets=offsets;
2725}
2726
2727/*
2728 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2729 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
2730 */
2731static UChar32
2732ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
2733                        UErrorCode *pErrorCode) {
2734    UConverter *cnv;
2735    const int32_t (*stateTable)[256];
2736    const uint8_t *source, *sourceLimit;
2737
2738    int32_t entry;
2739    uint8_t action;
2740
2741    /* set up the local pointers */
2742    cnv=pArgs->converter;
2743    source=(const uint8_t *)pArgs->source;
2744    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2745    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2746        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2747    } else {
2748        stateTable=cnv->sharedData->mbcs.stateTable;
2749    }
2750
2751    /* conversion loop */
2752    while(source<sourceLimit) {
2753        entry=stateTable[0][*source++];
2754        /* MBCS_ENTRY_IS_FINAL(entry) */
2755
2756        /* write back the updated pointer early so that we can return directly */
2757        pArgs->source=(const char *)source;
2758
2759        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2760            /* output BMP code point */
2761            return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2762        }
2763
2764        /*
2765         * An if-else-if chain provides more reliable performance for
2766         * the most common cases compared to a switch.
2767         */
2768        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2769        if( action==MBCS_STATE_VALID_DIRECT_20 ||
2770            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2771        ) {
2772            /* output supplementary code point */
2773            return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2774        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2775            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2776                /* output BMP code point */
2777                return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2778            }
2779        } else if(action==MBCS_STATE_UNASSIGNED) {
2780            /* just fall through */
2781        } else if(action==MBCS_STATE_ILLEGAL) {
2782            /* callback(illegal) */
2783            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2784        } else {
2785            /* reserved, must never occur */
2786            continue;
2787        }
2788
2789        if(U_FAILURE(*pErrorCode)) {
2790            /* callback(illegal) */
2791            break;
2792        } else /* unassigned sequence */ {
2793            /* defer to the generic implementation */
2794            pArgs->source=(const char *)source-1;
2795            return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2796        }
2797    }
2798
2799    /* no output because of empty input or only state changes */
2800    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2801    return 0xffff;
2802}
2803
2804/*
2805 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
2806 * conversion without offset handling.
2807 *
2808 * When a character does not have a mapping to Unicode, then we return to the
2809 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2810 * handling.
2811 * We also defer to the generic code in other complicated cases and have them
2812 * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
2813 *
2814 * All normal mappings and errors are handled here.
2815 */
2816static UChar32
2817ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
2818                  UErrorCode *pErrorCode) {
2819    UConverter *cnv;
2820    const uint8_t *source, *sourceLimit, *lastSource;
2821
2822    const int32_t (*stateTable)[256];
2823    const uint16_t *unicodeCodeUnits;
2824
2825    uint32_t offset;
2826    uint8_t state;
2827
2828    int32_t entry;
2829    UChar32 c;
2830    uint8_t action;
2831
2832    /* use optimized function if possible */
2833    cnv=pArgs->converter;
2834
2835    if(cnv->preToULength>0) {
2836        /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2837        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2838    }
2839
2840    if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
2841        /*
2842         * Using the generic ucnv_getNextUChar() code lets us deal correctly
2843         * with the rare case of a codepage that maps single surrogates
2844         * without adding the complexity to this already complicated function here.
2845         */
2846        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2847    } else if(cnv->sharedData->mbcs.countStates==1) {
2848        return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
2849    }
2850
2851    /* set up the local pointers */
2852    source=lastSource=(const uint8_t *)pArgs->source;
2853    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2854
2855    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2856        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2857    } else {
2858        stateTable=cnv->sharedData->mbcs.stateTable;
2859    }
2860    unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2861
2862    /* get the converter state from UConverter */
2863    offset=cnv->toUnicodeStatus;
2864
2865    /*
2866     * if we are in the SBCS state for a DBCS-only converter,
2867     * then load the DBCS state from the MBCS data
2868     * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2869     */
2870    if((state=(uint8_t)(cnv->mode))==0) {
2871        state=cnv->sharedData->mbcs.dbcsOnlyState;
2872    }
2873
2874    /* conversion loop */
2875    c=U_SENTINEL;
2876    while(source<sourceLimit) {
2877        entry=stateTable[state][*source++];
2878        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2879            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2880            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2881
2882            /* optimization for 1/2-byte input and BMP output */
2883            if( source<sourceLimit &&
2884                MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2885                MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2886                (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2887            ) {
2888                ++source;
2889                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2890                /* output BMP code point */
2891                break;
2892            }
2893        } else {
2894            /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2895            cnv->mode=state;
2896
2897            /* set the next state early so that we can reuse the entry variable */
2898            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2899
2900            /*
2901             * An if-else-if chain provides more reliable performance for
2902             * the most common cases compared to a switch.
2903             */
2904            action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2905            if(action==MBCS_STATE_VALID_DIRECT_16) {
2906                /* output BMP code point */
2907                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2908                break;
2909            } else if(action==MBCS_STATE_VALID_16) {
2910                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2911                c=unicodeCodeUnits[offset];
2912                if(c<0xfffe) {
2913                    /* output BMP code point */
2914                    break;
2915                } else if(c==0xfffe) {
2916                    if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2917                        break;
2918                    }
2919                } else {
2920                    /* callback(illegal) */
2921                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2922                }
2923            } else if(action==MBCS_STATE_VALID_16_PAIR) {
2924                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2925                c=unicodeCodeUnits[offset++];
2926                if(c<0xd800) {
2927                    /* output BMP code point below 0xd800 */
2928                    break;
2929                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2930                    /* output roundtrip or fallback supplementary code point */
2931                    c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
2932                    break;
2933                } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2934                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2935                    c=unicodeCodeUnits[offset];
2936                    break;
2937                } else if(c==0xffff) {
2938                    /* callback(illegal) */
2939                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2940                }
2941            } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2942                      (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2943            ) {
2944                /* output supplementary code point */
2945                c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2946                break;
2947            } else if(action==MBCS_STATE_CHANGE_ONLY) {
2948                /*
2949                 * This serves as a state change without any output.
2950                 * It is useful for reading simple stateful encodings,
2951                 * for example using just Shift-In/Shift-Out codes.
2952                 * The 21 unused bits may later be used for more sophisticated
2953                 * state transitions.
2954                 */
2955                if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
2956                    /* SI/SO are illegal for DBCS-only conversion */
2957                    state=(uint8_t)(cnv->mode); /* restore the previous state */
2958
2959                    /* callback(illegal) */
2960                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2961                }
2962            } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2963                if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2964                    /* output BMP code point */
2965                    c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2966                    break;
2967                }
2968            } else if(action==MBCS_STATE_UNASSIGNED) {
2969                /* just fall through */
2970            } else if(action==MBCS_STATE_ILLEGAL) {
2971                /* callback(illegal) */
2972                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2973            } else {
2974                /* reserved (must never occur), or only state change */
2975                offset=0;
2976                lastSource=source;
2977                continue;
2978            }
2979
2980            /* end of action codes: prepare for a new character */
2981            offset=0;
2982
2983            if(U_FAILURE(*pErrorCode)) {
2984                /* callback(illegal) */
2985                break;
2986            } else /* unassigned sequence */ {
2987                /* defer to the generic implementation */
2988                cnv->toUnicodeStatus=0;
2989                cnv->mode=state;
2990                pArgs->source=(const char *)lastSource;
2991                return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2992            }
2993        }
2994    }
2995
2996    if(c<0) {
2997        if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
2998            /* incomplete character byte sequence */
2999            uint8_t *bytes=cnv->toUBytes;
3000            cnv->toULength=(int8_t)(source-lastSource);
3001            do {
3002                *bytes++=*lastSource++;
3003            } while(lastSource<source);
3004            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3005        } else if(U_FAILURE(*pErrorCode)) {
3006            /* callback(illegal) */
3007            /*
3008             * Ticket 5691: consistent illegal sequences:
3009             * - We include at least the first byte in the illegal sequence.
3010             * - If any of the non-initial bytes could be the start of a character,
3011             *   we stop the illegal sequence before the first one of those.
3012             */
3013            UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
3014            uint8_t *bytes=cnv->toUBytes;
3015            *bytes++=*lastSource++;     /* first byte */
3016            if(lastSource==source) {
3017                cnv->toULength=1;
3018            } else /* lastSource<source: multi-byte character */ {
3019                int8_t i;
3020                for(i=1;
3021                    lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
3022                    ++i
3023                ) {
3024                    *bytes++=*lastSource++;
3025                }
3026                cnv->toULength=i;
3027                source=lastSource;
3028            }
3029        } else {
3030            /* no output because of empty input or only state changes */
3031            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
3032        }
3033        c=0xffff;
3034    }
3035
3036    /* set the converter state back into UConverter, ready for a new character */
3037    cnv->toUnicodeStatus=0;
3038    cnv->mode=state;
3039
3040    /* write back the updated pointer */
3041    pArgs->source=(const char *)source;
3042    return c;
3043}
3044
#if 0
/*
 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
 * Removal improves code coverage.
 */
/**
 * Single-byte, single-state flavor of ucnv_MBCSSimpleGetNextUChar():
 * looks up byte b in row 0 of the shared state table and returns the result.
 * The EBCDIC swaplfnl option (set in UConverter) is not handled.
 * Conversion extensions (_extToU()) are not handled.
 *
 * Returns the mapped code point, 0xfffe for unassigned bytes and for
 * fallbacks that are not enabled, or 0xffff for illegal bytes and
 * reserved action codes.
 */
U_CFUNC UChar32
ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
                              uint8_t b, UBool useFallback) {
    int32_t stateEntry=sharedData->mbcs.stateTable[0][b];
    uint8_t finalAction;
    /* MBCS_ENTRY_IS_FINAL(stateEntry) */

    /* most common case first: roundtrip BMP code point */
    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(stateEntry)) {
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(stateEntry);
    }

    finalAction=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(stateEntry));

    /* roundtrip supplementary code point */
    if(finalAction==MBCS_STATE_VALID_DIRECT_20) {
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(stateEntry);
    }

    /* fallbacks: only honored when enabled, otherwise reported as unassigned */
    if(finalAction==MBCS_STATE_FALLBACK_DIRECT_16 || finalAction==MBCS_STATE_FALLBACK_DIRECT_20) {
        if(!TO_U_USE_FALLBACK(useFallback)) {
            return 0xfffe;
        }
        if(finalAction==MBCS_STATE_FALLBACK_DIRECT_16) {
            /* BMP code point */
            return (UChar)MBCS_ENTRY_FINAL_VALUE_16(stateEntry);
        }
        /* supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(stateEntry);
    }

    if(finalAction==MBCS_STATE_UNASSIGNED) {
        return 0xfffe;
    }

    /* MBCS_STATE_ILLEGAL, or a reserved action code which must never occur */
    return 0xffff;
}
#endif
3099
3100/*
3101 * This is a simple version of _MBCSGetNextUChar() that is used
3102 * by other converter implementations.
3103 * It only returns an "assigned" result if it consumes the entire input.
3104 * It does not use state from the converter, nor error codes.
3105 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3106 * It handles conversion extensions but not GB 18030.
3107 *
 * Parameters:
 * sharedData   shared converter data; this function reads mbcs.stateTable,
 *              mbcs.unicodeCodeUnits, mbcs.dbcsOnlyState and mbcs.extIndexes
 * source       input bytes, expected to hold exactly one character
 * length       number of bytes in source; length<=0 returns U+ffff
 * useFallback  passed to TO_U_USE_FALLBACK() for every fallback decision
 *              and forwarded to ucnv_extSimpleMatchToU()
 *
 * There is no UConverter in scope in this function, so all fallback tests
 * are made from the useFallback parameter.
 *
3108 * Return value:
3109 * U+fffe   unassigned
3110 * U+ffff   illegal
3111 * otherwise the Unicode code point
3112 */
3113U_CFUNC UChar32
3114ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
3115                        const char *source, int32_t length,
3116                        UBool useFallback) {
3117    const int32_t (*stateTable)[256];
3118    const uint16_t *unicodeCodeUnits;
3119
3120    uint32_t offset;
3121    uint8_t state, action;
3122
3123    UChar32 c;
3124    int32_t i, entry;
3125
3126    if(length<=0) {
3127        /* no input at all: "illegal" */
3128        return 0xffff;
3129    }
3130
3131#if 0
3132/*
3133 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3134 * TODO In future releases, verify that this function is never called for SBCS
3135 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
3136 * Removal improves code coverage.
3137 */
3138    /* use optimized function if possible */
3139    if(sharedData->mbcs.countStates==1) {
3140        if(length==1) {
3141            return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
3142        } else {
3143            return 0xffff; /* illegal: more than a single byte for an SBCS converter */
3144        }
3145    }
3146#endif
3147
3148    /* set up the local pointers */
3149    stateTable=sharedData->mbcs.stateTable;
3150    unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
3151
3152    /* converter state */
3153    offset=0;
3154    state=sharedData->mbcs.dbcsOnlyState;
3155
3156    /* conversion loop: one state table lookup per input byte until a final entry is reached */
3157    for(i=0;;) {
3158        entry=stateTable[state][(uint8_t)source[i++]];
3159        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
3160            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
3161            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
3162
3163            if(i==length) {
3164                return 0xffff; /* truncated character */
3165            }
3166        } else {
3167            /*
3168             * An if-else-if chain provides more reliable performance for
3169             * the most common cases compared to a switch.
3170             */
3171            action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
3172            if(action==MBCS_STATE_VALID_16) {
3173                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3174                c=unicodeCodeUnits[offset];
3175                if(c!=0xfffe) {
3176                    /* done */
3177                } else if(TO_U_USE_FALLBACK(useFallback)) {
3178                    c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
3179                /* else done with 0xfffe */
3180                }
3181                break;
3182            } else if(action==MBCS_STATE_VALID_DIRECT_16) {
3183                /* output BMP code point */
3184                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3185                break;
3186            } else if(action==MBCS_STATE_VALID_16_PAIR) {
3187                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3188                c=unicodeCodeUnits[offset++];
3189                if(c<0xd800) {
3190                    /* output BMP code point below 0xd800 */
3191                } else if(TO_U_USE_FALLBACK(useFallback) ? c<=0xdfff : c<=0xdbff) {
3192                    /* output roundtrip or fallback supplementary code point */
3193                    c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
3194                } else if(TO_U_USE_FALLBACK(useFallback) ? (c&0xfffe)==0xe000 : c==0xe000) {
3195                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3196                    c=unicodeCodeUnits[offset];
3197                } else if(c==0xffff) {
3198                    return 0xffff;
3199                } else {
3200                    c=0xfffe;
3201                }
3202                break;
3203            } else if(action==MBCS_STATE_VALID_DIRECT_20) {
3204                /* output supplementary code point */
3205                c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3206                break;
3207            } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
3208                if(!TO_U_USE_FALLBACK(useFallback)) {
3209                    c=0xfffe;
3210                    break;
3211                }
3212                /* output BMP code point */
3213                c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3214                break;
3215            } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
3216                if(!TO_U_USE_FALLBACK(useFallback)) {
3217                    c=0xfffe;
3218                    break;
3219                }
3220                /* output supplementary code point */
3221                c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3222                break;
3223            } else if(action==MBCS_STATE_UNASSIGNED) {
3224                c=0xfffe;
3225                break;
3226            }
3227
3228            /*
3229             * forbid MBCS_STATE_CHANGE_ONLY for this function,
3230             * and MBCS_STATE_ILLEGAL and reserved action codes
3231             */
3232            return 0xffff;
3233        }
3234    }
3235
3236    if(i!=length) {
3237        /* illegal for this function: not all input consumed */
3238        return 0xffff;
3239    }
3240
3241    if(c==0xfffe) {
3242        /* try an extension mapping */
3243        const int32_t *cx=sharedData->mbcs.extIndexes;
3244        if(cx!=NULL) {
3245            return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
3246        }
3247    }
3248
3249    return c;
3250}
3251
3252/* MBCS-from-Unicode conversion functions ----------------------------------- */
3253
3254/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages.
 *
 * Converts UTF-16 text from pArgs->source into codepage bytes at pArgs->target.
 * Table results are written as one or two bytes; code points without a
 * usable table result are passed to _extFromU().
 *
 * pArgs       conversion arguments; converter, source, sourceLimit, target,
 *             targetLimit, offsets and flush are read, and source, target
 *             and offsets are written back on return
 * pErrorCode  set to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate and to
 *             U_BUFFER_OVERFLOW_ERROR when the target is full;
 *             it is also passed to _extFromU()
 *
 * The code point in progress is read from cnv->fromUChar32 on entry and
 * stored there again on return.
 */
3255static void
3256ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3257                                  UErrorCode *pErrorCode) {
3258    UConverter *cnv;
3259    const UChar *source, *sourceLimit;
3260    uint8_t *target;
3261    int32_t targetCapacity;
3262    int32_t *offsets;
3263
3264    const uint16_t *table;
3265    const uint16_t *mbcsIndex;
3266    const uint8_t *bytes;
3267
3268    UChar32 c;
3269
3270    int32_t sourceIndex, nextSourceIndex;
3271
3272    uint32_t stage2Entry;
3273    uint32_t asciiRoundtrips;
3274    uint32_t value;
3275    uint8_t unicodeMask;
3276
3277    /* get the converter and its unicodeMask (tested below for UCNV_HAS_SURROGATES and UCNV_HAS_SUPPLEMENTARY) */
3278    cnv=pArgs->converter;
3279    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3280
3281    /* set up the local pointers */
3282    source=pArgs->source;
3283    sourceLimit=pArgs->sourceLimit;
3284    target=(uint8_t *)pArgs->target;
3285    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3286    offsets=pArgs->offsets;
3287
3288    table=cnv->sharedData->mbcs.fromUnicodeTable;
3289    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
3290    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3291        bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3292    } else {
3293        bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3294    }
3295    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3296
3297    /* get the converter state from UConverter */
3298    c=cnv->fromUChar32;
3299
3300    /* sourceIndex=-1 if the current character began in the previous buffer */
3301    sourceIndex= c==0 ? 0 : -1;
3302    nextSourceIndex=0;
3303
3304    /* conversion loop; a nonzero c saved by a previous call resumes at getTrail inside the loop */
3305    if(c!=0 && targetCapacity>0) {
3306        goto getTrail;
3307    }
3308
3309    while(source<sourceLimit) {
3310        /*
3311         * This following test is to see if available input would overflow the output.
3312         * It does not catch output of more than one byte that
3313         * overflows as a result of a multi-byte character or callback output
3314         * from the last source character.
3315         * Therefore, those situations also test for overflows and will
3316         * then break the loop, too.
3317         */
3318        if(targetCapacity>0) {
3319            /*
3320             * Get a correct Unicode code point:
3321             * a single UChar for a BMP code point or
3322             * a matched surrogate pair for a "supplementary code point".
3323             */
3324            c=*source++;
3325            ++nextSourceIndex;
3326            if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3327                *target++=(uint8_t)c;
3328                if(offsets!=NULL) {
3329                    *offsets++=sourceIndex;
3330                    sourceIndex=nextSourceIndex;
3331                }
3332                --targetCapacity;
3333                c=0;
3334                continue;
3335            }
3336            /*
3337             * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3338             * to avoid dealing with surrogates.
3339             * MBCS_FAST_MAX must be >=0xd7ff.
3340             */
3341            if(c<=0xd7ff) {
3342                value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
3343                /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3344                if(value==0) {
3345                    goto unassigned;
3346                }
3347                /* output the value */
3348            } else {
3349                /*
3350                 * This also tests if the codepage maps single surrogates.
3351                 * If it does, then surrogates are not paired but mapped separately.
3352                 * Note that in this case unmatched surrogates are not detected.
3353                 */
3354                if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3355                    if(UTF_IS_SURROGATE_FIRST(c)) {
3356getTrail:
3357                        if(source<sourceLimit) {
3358                            /* test the following code unit */
3359                            UChar trail=*source;
3360                            if(UTF_IS_SECOND_SURROGATE(trail)) {
3361                                ++source;
3362                                ++nextSourceIndex;
3363                                c=UTF16_GET_PAIR_VALUE(c, trail);
3364                                if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3365                                    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3366                                    /* callback(unassigned) */
3367                                    goto unassigned;
3368                                }
3369                                /* convert this supplementary code point */
3370                                /* exit this condition tree */
3371                            } else {
3372                                /* this is an unmatched lead code unit (1st surrogate) */
3373                                /* callback(illegal) */
3374                                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3375                                break;
3376                            }
3377                        } else {
3378                            /* no more input; c keeps the lead surrogate and is stored in cnv->fromUChar32 after the loop */
3379                            break;
3380                        }
3381                    } else {
3382                        /* this is an unmatched trail code unit (2nd surrogate) */
3383                        /* callback(illegal) */
3384                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3385                        break;
3386                    }
3387                }
3388
3389                /* convert the Unicode code point in c into codepage bytes */
3390                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3391
3392                /* get the bytes and the length for the output */
3393                /* MBCS_OUTPUT_2 */
3394                value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3395
3396                /* is this code point assigned, or do we use fallbacks? */
3397                if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
3398                     (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
3399                ) {
3400                    /*
3401                     * We allow a 0 byte output if the "assigned" bit is set for this entry.
3402                     * There is no way with this data structure for fallback output
3403                     * to be a zero byte.
3404                     */
3405
3406unassigned:
3407                    /* try an extension mapping */
                    /* pArgs->source records where source stood before the call so the advance can be measured */
3408                    pArgs->source=source;
3409                    c=_extFromU(cnv, cnv->sharedData,
3410                                c, &source, sourceLimit,
3411                                &target, target+targetCapacity,
3412                                &offsets, sourceIndex,
3413                                pArgs->flush,
3414                                pErrorCode);
3415                    nextSourceIndex+=(int32_t)(source-pArgs->source);
3416
3417                    if(U_FAILURE(*pErrorCode)) {
3418                        /* not mappable or buffer overflow */
3419                        break;
3420                    } else {
3421                        /* a mapping was written to the target, continue */
3422
3423                        /* recalculate the targetCapacity after an extension mapping */
3424                        targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3425
3426                        /* normal end of conversion: prepare for a new character */
3427                        sourceIndex=nextSourceIndex;
3428                        continue;
3429                    }
3430                }
3431            }
3432
3433            /* write the output character bytes from value and length */
3434            /* from the first if in the loop we know that targetCapacity>0 */
3435            if(value<=0xff) {
3436                /* this is easy because we know that there is enough space */
3437                *target++=(uint8_t)value;
3438                if(offsets!=NULL) {
3439                    *offsets++=sourceIndex;
3440                }
3441                --targetCapacity;
3442            } else /* length==2 */ {
3443                *target++=(uint8_t)(value>>8);
3444                if(2<=targetCapacity) {
3445                    *target++=(uint8_t)value;
3446                    if(offsets!=NULL) {
3447                        *offsets++=sourceIndex;
3448                        *offsets++=sourceIndex;
3449                    }
3450                    targetCapacity-=2;
3451                } else {
3452                    if(offsets!=NULL) {
3453                        *offsets++=sourceIndex;
3454                    }
                    /* the second byte does not fit into the target: store it in cnv->charErrorBuffer */
3455                    cnv->charErrorBuffer[0]=(char)value;
3456                    cnv->charErrorBufferLength=1;
3457
3458                    /* target overflow */
3459                    targetCapacity=0;
3460                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3461                    c=0;
3462                    break;
3463                }
3464            }
3465
3466            /* normal end of conversion: prepare for a new character */
3467            c=0;
3468            sourceIndex=nextSourceIndex;
3469            continue;
3470        } else {
3471            /* target is full */
3472            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3473            break;
3474        }
3475    }
3476
3477    /* set the converter state back into UConverter */
3478    cnv->fromUChar32=c;
3479
3480    /* write back the updated pointers */
3481    pArgs->source=source;
3482    pArgs->target=(char *)target;
3483    pArgs->offsets=offsets;
3484}
3485
3486/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages.
 *
 * Converts UTF-16 text from pArgs->source into codepage bytes at pArgs->target,
 * one byte per code point that has a usable table result; all other code
 * points are passed to _extFromU().
 *
 * pArgs       conversion arguments; converter, source, sourceLimit, target,
 *             targetLimit, offsets and flush are read, and source, target
 *             and offsets are written back on return
 * pErrorCode  set to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate and to
 *             U_BUFFER_OVERFLOW_ERROR when the target is full;
 *             it is also passed to _extFromU()
 *
 * The code point in progress is read from cnv->fromUChar32 on entry and
 * stored there again on return.
 */
3487static void
3488ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3489                                  UErrorCode *pErrorCode) {
3490    UConverter *cnv;
3491    const UChar *source, *sourceLimit;
3492    uint8_t *target;
3493    int32_t targetCapacity;
3494    int32_t *offsets;
3495
3496    const uint16_t *table;
3497    const uint16_t *results;
3498
3499    UChar32 c;
3500
3501    int32_t sourceIndex, nextSourceIndex;
3502
3503    uint16_t value, minValue;
3504    UBool hasSupplementary;
3505
3506    /* set up the local pointers */
3507    cnv=pArgs->converter;
3508    source=pArgs->source;
3509    sourceLimit=pArgs->sourceLimit;
3510    target=(uint8_t *)pArgs->target;
3511    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3512    offsets=pArgs->offsets;
3513
3514    table=cnv->sharedData->mbcs.fromUnicodeTable;
3515    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3516        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3517    } else {
3518        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
3519    }
3520
    /* minValue is the smallest table result that is written out; smaller results go to _extFromU() */
3521    if(cnv->useFallback) {
3522        /* use all roundtrip and fallback results */
3523        minValue=0x800;
3524    } else {
3525        /* use only roundtrips and fallbacks from private-use characters */
3526        minValue=0xc00;
3527    }
3528    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
3529
3530    /* get the converter state from UConverter */
3531    c=cnv->fromUChar32;
3532
3533    /* sourceIndex=-1 if the current character began in the previous buffer */
3534    sourceIndex= c==0 ? 0 : -1;
3535    nextSourceIndex=0;
3536
3537    /* conversion loop; a nonzero c saved by a previous call resumes at getTrail inside the loop */
3538    if(c!=0 && targetCapacity>0) {
3539        goto getTrail;
3540    }
3541
3542    while(source<sourceLimit) {
3543        /*
3544         * This following test is to see if available input would overflow the output.
3545         * It does not catch output of more than one byte that
3546         * overflows as a result of a multi-byte character or callback output
3547         * from the last source character.
3548         * Therefore, those situations also test for overflows and will
3549         * then break the loop, too.
3550         */
3551        if(targetCapacity>0) {
3552            /*
3553             * Get a correct Unicode code point:
3554             * a single UChar for a BMP code point or
3555             * a matched surrogate pair for a "supplementary code point".
3556             */
3557            c=*source++;
3558            ++nextSourceIndex;
3559            if(UTF_IS_SURROGATE(c)) {
3560                if(UTF_IS_SURROGATE_FIRST(c)) {
3561getTrail:
3562                    if(source<sourceLimit) {
3563                        /* test the following code unit */
3564                        UChar trail=*source;
3565                        if(UTF_IS_SECOND_SURROGATE(trail)) {
3566                            ++source;
3567                            ++nextSourceIndex;
3568                            c=UTF16_GET_PAIR_VALUE(c, trail);
3569                            if(!hasSupplementary) {
3570                                /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3571                                /* callback(unassigned) */
3572                                goto unassigned;
3573                            }
3574                            /* convert this supplementary code point */
3575                            /* exit this condition tree */
3576                        } else {
3577                            /* this is an unmatched lead code unit (1st surrogate) */
3578                            /* callback(illegal) */
3579                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3580                            break;
3581                        }
3582                    } else {
3583                        /* no more input; c keeps the lead surrogate and is stored in cnv->fromUChar32 after the loop */
3584                        break;
3585                    }
3586                } else {
3587                    /* this is an unmatched trail code unit (2nd surrogate) */
3588                    /* callback(illegal) */
3589                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3590                    break;
3591                }
3592            }
3593
3594            /* convert the Unicode code point in c into codepage bytes */
3595            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3596
3597            /* is this code point assigned, or do we use fallbacks? */
3598            if(value>=minValue) {
3599                /* assigned, write the output character bytes from value and length */
3600                /* length==1 */
3601                /* this is easy because we know that there is enough space */
3602                *target++=(uint8_t)value;
3603                if(offsets!=NULL) {
3604                    *offsets++=sourceIndex;
3605                }
3606                --targetCapacity;
3607
3608                /* normal end of conversion: prepare for a new character */
3609                c=0;
3610                sourceIndex=nextSourceIndex;
3611            } else { /* unassigned */
3612unassigned:
3613                /* try an extension mapping */
                /* pArgs->source records where source stood before the call so the advance can be measured */
3614                pArgs->source=source;
3615                c=_extFromU(cnv, cnv->sharedData,
3616                            c, &source, sourceLimit,
3617                            &target, target+targetCapacity,
3618                            &offsets, sourceIndex,
3619                            pArgs->flush,
3620                            pErrorCode);
3621                nextSourceIndex+=(int32_t)(source-pArgs->source);
3622
3623                if(U_FAILURE(*pErrorCode)) {
3624                    /* not mappable or buffer overflow */
3625                    break;
3626                } else {
3627                    /* a mapping was written to the target, continue */
3628
3629                    /* recalculate the targetCapacity after an extension mapping */
3630                    targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3631
3632                    /* normal end of conversion: prepare for a new character */
3633                    sourceIndex=nextSourceIndex;
3634                }
3635            }
3636        } else {
3637            /* target is full */
3638            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3639            break;
3640        }
3641    }
3642
3643    /* set the converter state back into UConverter */
3644    cnv->fromUChar32=c;
3645
3646    /* write back the updated pointers */
3647    pArgs->source=source;
3648    pArgs->target=(char *)target;
3649    pArgs->offsets=offsets;
3650}
3651
3652/*
3653 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
3654 * that map only to and from the BMP.
3655 * In addition to single-byte/state optimizations, the offset calculations
3656 * become much easier.
3657 * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3658 * but measurements have shown that this diminishes performance
3659 * in more cases than it improves it.
3660 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3661 * for various MBCS and SBCS optimizations.
3662 */
3663static void
3664ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
3665                              UErrorCode *pErrorCode) {
3666    UConverter *cnv;
3667    const UChar *source, *sourceLimit, *lastSource;
3668    uint8_t *target;
3669    int32_t targetCapacity, length;
3670    int32_t *offsets;
3671
3672    const uint16_t *table;
3673    const uint16_t *results;
3674
3675    UChar32 c;
3676
3677    int32_t sourceIndex;
3678
3679    uint32_t asciiRoundtrips;
3680    uint16_t value, minValue;
3681
3682    /* set up the local pointers */
3683    cnv=pArgs->converter;
3684    source=pArgs->source;
3685    sourceLimit=pArgs->sourceLimit;
3686    target=(uint8_t *)pArgs->target;
3687    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3688    offsets=pArgs->offsets;
3689
3690    table=cnv->sharedData->mbcs.fromUnicodeTable;
3691    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3692        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3693    } else {
3694        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
3695    }
3696    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3697
3698    if(cnv->useFallback) {
3699        /* use all roundtrip and fallback results */
3700        minValue=0x800;
3701    } else {
3702        /* use only roundtrips and fallbacks from private-use characters */
3703        minValue=0xc00;
3704    }
3705
3706    /* get the converter state from UConverter */
3707    c=cnv->fromUChar32;
3708
3709    /* sourceIndex=-1 if the current character began in the previous buffer */
3710    sourceIndex= c==0 ? 0 : -1;
3711    lastSource=source;
3712
3713    /*
3714     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3715     * for the minimum of the sourceLength and targetCapacity
3716     */
3717    length=(int32_t)(sourceLimit-source);
3718    if(length<targetCapacity) {
3719        targetCapacity=length;
3720    }
3721
3722    /* conversion loop */
3723    if(c!=0 && targetCapacity>0) {
3724        goto getTrail;
3725    }
3726
3727#if MBCS_UNROLL_SINGLE_FROM_BMP
3728    /* unrolling makes it slower on Pentium III/Windows 2000?! */
3729    /* unroll the loop with the most common case */
3730unrolled:
3731    if(targetCapacity>=4) {
3732        int32_t count, loops;
3733        uint16_t andedValues;
3734
3735        loops=count=targetCapacity>>2;
3736        do {
3737            c=*source++;
3738            andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3739            *target++=(uint8_t)value;
3740            c=*source++;
3741            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3742            *target++=(uint8_t)value;
3743            c=*source++;
3744            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3745            *target++=(uint8_t)value;
3746            c=*source++;
3747            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3748            *target++=(uint8_t)value;
3749
3750            /* were all 4 entries really valid? */
3751            if(andedValues<minValue) {
3752                /* no, return to the first of these 4 */
3753                source-=4;
3754                target-=4;
3755                break;
3756            }
3757        } while(--count>0);
3758        count=loops-count;
3759        targetCapacity-=4*count;
3760
3761        if(offsets!=NULL) {
3762            lastSource+=4*count;
3763            while(count>0) {
3764                *offsets++=sourceIndex++;
3765                *offsets++=sourceIndex++;
3766                *offsets++=sourceIndex++;
3767                *offsets++=sourceIndex++;
3768                --count;
3769            }
3770        }
3771
3772        c=0;
3773    }
3774#endif
3775
3776    while(targetCapacity>0) {
3777        /*
3778         * Get a correct Unicode code point:
3779         * a single UChar for a BMP code point or
3780         * a matched surrogate pair for a "supplementary code point".
3781         */
3782        c=*source++;
3783        /*
3784         * Do not immediately check for single surrogates:
3785         * Assume that they are unassigned and check for them in that case.
3786         * This speeds up the conversion of assigned characters.
3787         */
3788        /* convert the Unicode code point in c into codepage bytes */
3789        if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3790            *target++=(uint8_t)c;
3791            --targetCapacity;
3792            c=0;
3793            continue;
3794        }
3795        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3796        /* is this code point assigned, or do we use fallbacks? */
3797        if(value>=minValue) {
3798            /* assigned, write the output character bytes from value and length */
3799            /* length==1 */
3800            /* this is easy because we know that there is enough space */
3801            *target++=(uint8_t)value;
3802            --targetCapacity;
3803
3804            /* normal end of conversion: prepare for a new character */
3805            c=0;
3806            continue;
3807        } else if(!UTF_IS_SURROGATE(c)) {
3808            /* normal, unassigned BMP character */
3809        } else if(UTF_IS_SURROGATE_FIRST(c)) {
3810getTrail:
3811            if(source<sourceLimit) {
3812                /* test the following code unit */
3813                UChar trail=*source;
3814                if(UTF_IS_SECOND_SURROGATE(trail)) {
3815                    ++source;
3816                    c=UTF16_GET_PAIR_VALUE(c, trail);
3817                    /* this codepage does not map supplementary code points */
3818                    /* callback(unassigned) */
3819                } else {
3820                    /* this is an unmatched lead code unit (1st surrogate) */
3821                    /* callback(illegal) */
3822                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3823                    break;
3824                }
3825            } else {
3826                /* no more input */
3827                if (pArgs->flush) {
3828                    *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3829                }
3830                break;
3831            }
3832        } else {
3833            /* this is an unmatched trail code unit (2nd surrogate) */
3834            /* callback(illegal) */
3835            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3836            break;
3837        }
3838
3839        /* c does not have a mapping */
3840
3841        /* get the number of code units for c to correctly advance sourceIndex */
3842        length=U16_LENGTH(c);
3843
3844        /* set offsets since the start or the last extension */
3845        if(offsets!=NULL) {
3846            int32_t count=(int32_t)(source-lastSource);
3847
3848            /* do not set the offset for this character */
3849            count-=length;
3850
3851            while(count>0) {
3852                *offsets++=sourceIndex++;
3853                --count;
3854            }
3855            /* offsets and sourceIndex are now set for the current character */
3856        }
3857
3858        /* try an extension mapping */
3859        lastSource=source;
3860        c=_extFromU(cnv, cnv->sharedData,
3861                    c, &source, sourceLimit,
3862                    &target, (const uint8_t *)(pArgs->targetLimit),
3863                    &offsets, sourceIndex,
3864                    pArgs->flush,
3865                    pErrorCode);
3866        sourceIndex+=length+(int32_t)(source-lastSource);
3867        lastSource=source;
3868
3869        if(U_FAILURE(*pErrorCode)) {
3870            /* not mappable or buffer overflow */
3871            break;
3872        } else {
3873            /* a mapping was written to the target, continue */
3874
3875            /* recalculate the targetCapacity after an extension mapping */
3876            targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3877            length=(int32_t)(sourceLimit-source);
3878            if(length<targetCapacity) {
3879                targetCapacity=length;
3880            }
3881        }
3882
3883#if MBCS_UNROLL_SINGLE_FROM_BMP
3884        /* unrolling makes it slower on Pentium III/Windows 2000?! */
3885        goto unrolled;
3886#endif
3887    }
3888
3889    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
3890        /* target is full */
3891        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3892    }
3893
3894    /* set offsets since the start or the last callback */
3895    if(offsets!=NULL) {
3896        size_t count=source-lastSource;
3897        if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
3898            /*
3899            Caller gave us a partial supplementary character,
3900            which this function couldn't convert in any case.
3901            The callback will handle the offset.
3902            */
3903            count--;
3904        }
3905        while(count>0) {
3906            *offsets++=sourceIndex++;
3907            --count;
3908        }
3909    }
3910
3911    /* set the converter state back into UConverter */
3912    cnv->fromUChar32=c;
3913
3914    /* write back the updated pointers */
3915    pArgs->source=source;
3916    pArgs->target=(char *)target;
3917    pArgs->offsets=offsets;
3918}
3919
3920U_CFUNC void
3921ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3922                            UErrorCode *pErrorCode) {
3923    UConverter *cnv;
3924    const UChar *source, *sourceLimit;
3925    uint8_t *target;
3926    int32_t targetCapacity;
3927    int32_t *offsets;
3928
3929    const uint16_t *table;
3930    const uint16_t *mbcsIndex;
3931    const uint8_t *p, *bytes;
3932    uint8_t outputType;
3933
3934    UChar32 c;
3935
3936    int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
3937
3938    uint32_t stage2Entry;
3939    uint32_t asciiRoundtrips;
3940    uint32_t value;
3941    uint8_t si_value[2] = {0, 0};
3942    uint8_t so_value[2] = {0, 0};
3943    uint8_t si_value_length, so_value_length;
3944    int32_t length = 0, prevLength;
3945    uint8_t unicodeMask;
3946
3947    cnv=pArgs->converter;
3948
3949    if(cnv->preFromUFirstCP>=0) {
3950        /*
3951         * pass sourceIndex=-1 because we continue from an earlier buffer
3952         * in the future, this may change with continuous offsets
3953         */
3954        ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
3955
3956        if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
3957            return;
3958        }
3959    }
3960
3961    /* use optimized function if possible */
3962    outputType=cnv->sharedData->mbcs.outputType;
3963    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3964    if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3965        if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3966            ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
3967        } else {
3968            ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
3969        }
3970        return;
3971    } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
3972        ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
3973        return;
3974    }
3975
3976    /* set up the local pointers */
3977    source=pArgs->source;
3978    sourceLimit=pArgs->sourceLimit;
3979    target=(uint8_t *)pArgs->target;
3980    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3981    offsets=pArgs->offsets;
3982
3983    table=cnv->sharedData->mbcs.fromUnicodeTable;
3984    if(cnv->sharedData->mbcs.utf8Friendly) {
3985        mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
3986    } else {
3987        mbcsIndex=NULL;
3988    }
3989    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3990        bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3991    } else {
3992        bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3993    }
3994    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3995
3996    /* get the converter state from UConverter */
3997    c=cnv->fromUChar32;
3998
3999    if(outputType==MBCS_OUTPUT_2_SISO) {
4000        prevLength=cnv->fromUnicodeStatus;
4001        if(prevLength==0) {
4002            /* set the real value */
4003            prevLength=1;
4004        }
4005    } else {
4006        /* prevent fromUnicodeStatus from being set to something non-0 */
4007        prevLength=0;
4008    }
4009
4010    /* sourceIndex=-1 if the current character began in the previous buffer */
4011    prevSourceIndex=-1;
4012    sourceIndex= c==0 ? 0 : -1;
4013    nextSourceIndex=0;
4014
4015    /* Get the SI/SO character for the converter */
4016    si_value_length = getSISOBytes(SI, cnv->options, si_value);
4017    so_value_length = getSISOBytes(SO, cnv->options, so_value);
4018
4019    /* conversion loop */
4020    /*
4021     * This is another piece of ugly code:
4022     * A goto into the loop if the converter state contains a first surrogate
4023     * from the previous function call.
4024     * It saves me from checking if(c==0) in each loop iteration
4025     * and from duplicating the trail-surrogate-handling code in the else
4026     * branch of that check.
4027     * I could not find any other way to get around this other than
4028     * using a function call for the conversion and callback, which would
4029     * be even more inefficient.
4030     *
4031     * Markus Scherer 2000-jul-19
4032     */
4033    if(c!=0 && targetCapacity>0) {
4034        goto getTrail;
4035    }
4036
4037    while(source<sourceLimit) {
4038        /*
4039         * The following test checks whether the available input would overflow the output.
4040         * It does not catch output of more than one byte that
4041         * overflows as a result of a multi-byte character or callback output
4042         * from the last source character.
4043         * Therefore, those situations also test for overflows and will
4044         * then break the loop, too.
4045         */
4046        if(targetCapacity>0) {
4047            /*
4048             * Get a correct Unicode code point:
4049             * a single UChar for a BMP code point or
4050             * a matched surrogate pair for a "supplementary code point".
4051             */
4052            c=*source++;
4053            ++nextSourceIndex;
4054            if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
4055                *target++=(uint8_t)c;
4056                if(offsets!=NULL) {
4057                    *offsets++=sourceIndex;
4058                    prevSourceIndex=sourceIndex;
4059                    sourceIndex=nextSourceIndex;
4060                }
4061                --targetCapacity;
4062                c=0;
4063                continue;
4064            }
4065            /*
4066             * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
4067             * to avoid dealing with surrogates.
4068             * MBCS_FAST_MAX must be >=0xd7ff.
4069             */
4070            if(c<=0xd7ff && mbcsIndex!=NULL) {
4071                value=mbcsIndex[c>>6];
4072
4073                /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
4074                /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
4075                switch(outputType) {
4076                case MBCS_OUTPUT_2:
4077                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
4078                    if(value<=0xff) {
4079                        if(value==0) {
4080                            goto unassigned;
4081                        } else {
4082                            length=1;
4083                        }
4084                    } else {
4085                        length=2;
4086                    }
4087                    break;
4088                case MBCS_OUTPUT_2_SISO:
4089                    /* 1/2-byte stateful with Shift-In/Shift-Out */
4090                    /*
4091                     * Save the old state in the converter object
4092                     * right here, then change the local prevLength state variable if necessary.
4093                     * Then, if this character turns out to be unassigned or a fallback that
4094                     * is not taken, the callback code must not save the new state in the converter
4095                     * because the new state is for a character that is not output.
4096                     * However, the callback must still restore the state from the converter
4097                     * in case the callback function changed it for its output.
4098                     */
4099                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
4100                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
4101                    if(value<=0xff) {
4102                        if(value==0) {
4103                            goto unassigned;
4104                        } else if(prevLength<=1) {
4105                            length=1;
4106                        } else {
4107                            /* change from double-byte mode to single-byte */
4108                            if (si_value_length == 1) {
4109                                value|=(uint32_t)si_value[0]<<8;
4110                                length = 2;
4111                            } else if (si_value_length == 2) {
4112                                value|=(uint32_t)si_value[1]<<8;
4113                                value|=(uint32_t)si_value[0]<<16;
4114                                length = 3;
4115                            }
4116                            prevLength=1;
4117                        }
4118                    } else {
4119                        if(prevLength==2) {
4120                            length=2;
4121                        } else {
4122                            /* change from single-byte mode to double-byte */
4123                            if (so_value_length == 1) {
4124                                value|=(uint32_t)so_value[0]<<16;
4125                                length = 3;
4126                            } else if (so_value_length == 2) {
4127                                value|=(uint32_t)so_value[1]<<16;
4128                                value|=(uint32_t)so_value[0]<<24;
4129                                length = 4;
4130                            }
4131                            prevLength=2;
4132                        }
4133                    }
4134                    break;
4135                case MBCS_OUTPUT_DBCS_ONLY:
4136                    /* table with single-byte results, but only DBCS mappings used */
4137                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
4138                    if(value<=0xff) {
4139                        /* no mapping or SBCS result, not taken for DBCS-only */
4140                        goto unassigned;
4141                    } else {
4142                        length=2;
4143                    }
4144                    break;
4145                case MBCS_OUTPUT_3:
4146                    p=bytes+(value+(c&0x3f))*3;
4147                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4148                    if(value<=0xff) {
4149                        if(value==0) {
4150                            goto unassigned;
4151                        } else {
4152                            length=1;
4153                        }
4154                    } else if(value<=0xffff) {
4155                        length=2;
4156                    } else {
4157                        length=3;
4158                    }
4159                    break;
4160                case MBCS_OUTPUT_4:
4161                    value=((const uint32_t *)bytes)[value +(c&0x3f)];
4162                    if(value<=0xff) {
4163                        if(value==0) {
4164                            goto unassigned;
4165                        } else {
4166                            length=1;
4167                        }
4168                    } else if(value<=0xffff) {
4169                        length=2;
4170                    } else if(value<=0xffffff) {
4171                        length=3;
4172                    } else {
4173                        length=4;
4174                    }
4175                    break;
4176                case MBCS_OUTPUT_3_EUC:
4177                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
4178                    /* EUC 16-bit fixed-length representation */
4179                    if(value<=0xff) {
4180                        if(value==0) {
4181                            goto unassigned;
4182                        } else {
4183                            length=1;
4184                        }
4185                    } else if((value&0x8000)==0) {
4186                        value|=0x8e8000;
4187                        length=3;
4188                    } else if((value&0x80)==0) {
4189                        value|=0x8f0080;
4190                        length=3;
4191                    } else {
4192                        length=2;
4193                    }
4194                    break;
4195                case MBCS_OUTPUT_4_EUC:
4196                    p=bytes+(value+(c&0x3f))*3;
4197                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4198                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
4199                    if(value<=0xff) {
4200                        if(value==0) {
4201                            goto unassigned;
4202                        } else {
4203                            length=1;
4204                        }
4205                    } else if(value<=0xffff) {
4206                        length=2;
4207                    } else if((value&0x800000)==0) {
4208                        value|=0x8e800000;
4209                        length=4;
4210                    } else if((value&0x8000)==0) {
4211                        value|=0x8f008000;
4212                        length=4;
4213                    } else {
4214                        length=3;
4215                    }
4216                    break;
4217                default:
4218                    /* must not occur */
4219                    /*
4220                     * To avoid compiler warnings that value & length may be
4221                     * used without having been initialized, we set them here.
4222                     * In reality, this is unreachable code.
4223                     * Not having a default branch also causes warnings with
4224                     * some compilers.
4225                     */
4226                    value=0;
4227                    length=0;
4228                    break;
4229                }
4230                /* output the value */
4231            } else {
4232                /*
4233                 * This also tests if the codepage maps single surrogates.
4234                 * If it does, then surrogates are not paired but mapped separately.
4235                 * Note that in this case unmatched surrogates are not detected.
4236                 */
4237                if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4238                    if(UTF_IS_SURROGATE_FIRST(c)) {
4239getTrail:
4240                        if(source<sourceLimit) {
4241                            /* test the following code unit */
4242                            UChar trail=*source;
4243                            if(UTF_IS_SECOND_SURROGATE(trail)) {
4244                                ++source;
4245                                ++nextSourceIndex;
4246                                c=UTF16_GET_PAIR_VALUE(c, trail);
4247                                if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4248                                    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4249                                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
4250                                    /* callback(unassigned) */
4251                                    goto unassigned;
4252                                }
4253                                /* convert this supplementary code point */
4254                                /* exit this condition tree */
4255                            } else {
4256                                /* this is an unmatched lead code unit (1st surrogate) */
4257                                /* callback(illegal) */
4258                                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4259                                break;
4260                            }
4261                        } else {
4262                            /* no more input */
4263                            break;
4264                        }
4265                    } else {
4266                        /* this is an unmatched trail code unit (2nd surrogate) */
4267                        /* callback(illegal) */
4268                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4269                        break;
4270                    }
4271                }
4272
4273                /* convert the Unicode code point in c into codepage bytes */
4274
4275                /*
4276                 * The basic lookup is a triple-stage compact array (trie) lookup.
4277                 * For details see the beginning of this file.
4278                 *
4279                 * Single-byte codepages are handled with a different data structure
4280                 * by _MBCSSingle... functions.
4281                 *
4282                 * The result consists of a 32-bit value from stage 2 and
4283                 * a pointer to as many bytes as are stored per character.
4284                 * The pointer points to the character's bytes in stage 3.
4285                 * Bits 15..0 of the stage 2 entry contain the stage 3 index
4286                 * for that pointer, while bits 31..16 are flags for which of
4287                 * the 16 characters in the block are roundtrip-assigned.
4288                 *
4289                 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
4290                 * respectively as uint32_t, in the platform encoding.
4291                 * For 3-byte codepages, the bytes are always stored in big-endian order.
4292                 *
4293                 * For EUC encodings that use only either 0x8e or 0x8f as the first
4294                 * byte of their longest byte sequences, the first two bytes in
4295                 * this third stage indicate with their 7th bits whether these bytes
4296                 * are to be written directly or actually need to be preceded by
4297                 * one of the two Single-Shift codes. With this, the third stage
4298                 * stores one byte fewer per character than the actual maximum length of
4299                 * EUC byte sequences.
4300                 *
4301                 * Other than that, leading zero bytes are removed and the other
4302                 * bytes output. A single zero byte may be output if the "assigned"
4303                 * bit in stage 2 was on.
4304                 * The data structure does not support zero byte output as a fallback,
4305                 * and also does not allow output of leading zeros.
4306                 */
4307                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4308
4309                /* get the bytes and the length for the output */
4310                switch(outputType) {
4311                case MBCS_OUTPUT_2:
4312                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4313                    if(value<=0xff) {
4314                        length=1;
4315                    } else {
4316                        length=2;
4317                    }
4318                    break;
4319                case MBCS_OUTPUT_2_SISO:
4320                    /* 1/2-byte stateful with Shift-In/Shift-Out */
4321                    /*
4322                     * Save the old state in the converter object
4323                     * right here, then change the local prevLength state variable if necessary.
4324                     * Then, if this character turns out to be unassigned or a fallback that
4325                     * is not taken, the callback code must not save the new state in the converter
4326                     * because the new state is for a character that is not output.
4327                     * However, the callback must still restore the state from the converter
4328                     * in case the callback function changed it for its output.
4329                     */
4330                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
4331                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4332                    if(value<=0xff) {
4333                        if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
4334                            /* no mapping, leave value==0 */
4335                            length=0;
4336                        } else if(prevLength<=1) {
4337                            length=1;
4338                        } else {
4339                            /* change from double-byte mode to single-byte */
4340                            if (si_value_length == 1) {
4341                                value|=(uint32_t)si_value[0]<<8;
4342                                length = 2;
4343                            } else if (si_value_length == 2) {
4344                                value|=(uint32_t)si_value[1]<<8;
4345                                value|=(uint32_t)si_value[0]<<16;
4346                                length = 3;
4347                            }
4348                            prevLength=1;
4349                        }
4350                    } else {
4351                        if(prevLength==2) {
4352                            length=2;
4353                        } else {
4354                            /* change from single-byte mode to double-byte */
4355                            if (so_value_length == 1) {
4356                                value|=(uint32_t)so_value[0]<<16;
4357                                length = 3;
4358                            } else if (so_value_length == 2) {
4359                                value|=(uint32_t)so_value[1]<<16;
4360                                value|=(uint32_t)so_value[0]<<24;
4361                                length = 4;
4362                            }
4363                            prevLength=2;
4364                        }
4365                    }
4366                    break;
4367                case MBCS_OUTPUT_DBCS_ONLY:
4368                    /* table with single-byte results, but only DBCS mappings used */
4369                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4370                    if(value<=0xff) {
4371                        /* no mapping or SBCS result, not taken for DBCS-only */
4372                        value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4373                        length=0;
4374                    } else {
4375                        length=2;
4376                    }
4377                    break;
4378                case MBCS_OUTPUT_3:
4379                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4380                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4381                    if(value<=0xff) {
4382                        length=1;
4383                    } else if(value<=0xffff) {
4384                        length=2;
4385                    } else {
4386                        length=3;
4387                    }
4388                    break;
4389                case MBCS_OUTPUT_4:
4390                    value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
4391                    if(value<=0xff) {
4392                        length=1;
4393                    } else if(value<=0xffff) {
4394                        length=2;
4395                    } else if(value<=0xffffff) {
4396                        length=3;
4397                    } else {
4398                        length=4;
4399                    }
4400                    break;
4401                case MBCS_OUTPUT_3_EUC:
4402                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4403                    /* EUC 16-bit fixed-length representation */
4404                    if(value<=0xff) {
4405                        length=1;
4406                    } else if((value&0x8000)==0) {
4407                        value|=0x8e8000;
4408                        length=3;
4409                    } else if((value&0x80)==0) {
4410                        value|=0x8f0080;
4411                        length=3;
4412                    } else {
4413                        length=2;
4414                    }
4415                    break;
4416                case MBCS_OUTPUT_4_EUC:
4417                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4418                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4419                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
4420                    if(value<=0xff) {
4421                        length=1;
4422                    } else if(value<=0xffff) {
4423                        length=2;
4424                    } else if((value&0x800000)==0) {
4425                        value|=0x8e800000;
4426                        length=4;
4427                    } else if((value&0x8000)==0) {
4428                        value|=0x8f008000;
4429                        length=4;
4430                    } else {
4431                        length=3;
4432                    }
4433                    break;
4434                default:
4435                    /* must not occur */
4436                    /*
4437                     * To avoid compiler warnings that value & length may be
4438                     * used without having been initialized, we set them here.
4439                     * In reality, this is unreachable code.
4440                     * Not having a default branch also causes warnings with
4441                     * some compilers.
4442                     */
4443                    value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4444                    length=0;
4445                    break;
4446                }
4447
4448                /* is this code point assigned, or do we use fallbacks? */
4449                if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
4450                     (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
4451                ) {
4452                    /*
4453                     * We allow a 0 byte output if the "assigned" bit is set for this entry.
4454                     * There is no way with this data structure for fallback output
4455                     * to be a zero byte.
4456                     */
4457
4458unassigned:
4459                    /* try an extension mapping */
4460                    pArgs->source=source;
4461                    c=_extFromU(cnv, cnv->sharedData,
4462                                c, &source, sourceLimit,
4463                                &target, target+targetCapacity,
4464                                &offsets, sourceIndex,
4465                                pArgs->flush,
4466                                pErrorCode);
4467                    nextSourceIndex+=(int32_t)(source-pArgs->source);
4468                    prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
4469
4470                    if(U_FAILURE(*pErrorCode)) {
4471                        /* not mappable or buffer overflow */
4472                        break;
4473                    } else {
4474                        /* a mapping was written to the target, continue */
4475
4476                        /* recalculate the targetCapacity after an extension mapping */
4477                        targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4478
4479                        /* normal end of conversion: prepare for a new character */
4480                        if(offsets!=NULL) {
4481                            prevSourceIndex=sourceIndex;
4482                            sourceIndex=nextSourceIndex;
4483                        }
4484                        continue;
4485                    }
4486                }
4487            }
4488
4489            /* write the output character bytes from value and length */
4490            /* from the first if in the loop we know that targetCapacity>0 */
4491            if(length<=targetCapacity) {
4492                if(offsets==NULL) {
4493                    switch(length) {
4494                        /* each branch falls through to the next one */
4495                    case 4:
4496                        *target++=(uint8_t)(value>>24);
4497                    case 3:
4498                        *target++=(uint8_t)(value>>16);
4499                    case 2:
4500                        *target++=(uint8_t)(value>>8);
4501                    case 1:
4502                        *target++=(uint8_t)value;
4503                    default:
4504                        /* will never occur */
4505                        break;
4506                    }
4507                } else {
4508                    switch(length) {
4509                        /* each branch falls through to the next one */
4510                    case 4:
4511                        *target++=(uint8_t)(value>>24);
4512                        *offsets++=sourceIndex;
4513                    case 3:
4514                        *target++=(uint8_t)(value>>16);
4515                        *offsets++=sourceIndex;
4516                    case 2:
4517                        *target++=(uint8_t)(value>>8);
4518                        *offsets++=sourceIndex;
4519                    case 1:
4520                        *target++=(uint8_t)value;
4521                        *offsets++=sourceIndex;
4522                    default:
4523                        /* will never occur */
4524                        break;
4525                    }
4526                }
4527                targetCapacity-=length;
4528            } else {
4529                uint8_t *charErrorBuffer;
4530
4531                /*
4532                 * We actually do this backwards here:
4533                 * In order to save an intermediate variable, we output
4534                 * first to the overflow buffer what does not fit into the
4535                 * regular target.
4536                 */
4537                /* we know that 1<=targetCapacity<length<=4 */
4538                length-=targetCapacity;
4539                charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
4540                switch(length) {
4541                    /* each branch falls through to the next one */
4542                case 3:
4543                    *charErrorBuffer++=(uint8_t)(value>>16);
4544                case 2:
4545                    *charErrorBuffer++=(uint8_t)(value>>8);
4546                case 1:
4547                    *charErrorBuffer=(uint8_t)value;
4548                default:
4549                    /* will never occur */
4550                    break;
4551                }
4552                cnv->charErrorBufferLength=(int8_t)length;
4553
4554                /* now output what fits into the regular target */
4555                value>>=8*length; /* length was reduced by targetCapacity */
4556                switch(targetCapacity) {
4557                    /* each branch falls through to the next one */
4558                case 3:
4559                    *target++=(uint8_t)(value>>16);
4560                    if(offsets!=NULL) {
4561                        *offsets++=sourceIndex;
4562                    }
4563                case 2:
4564                    *target++=(uint8_t)(value>>8);
4565                    if(offsets!=NULL) {
4566                        *offsets++=sourceIndex;
4567                    }
4568                case 1:
4569                    *target++=(uint8_t)value;
4570                    if(offsets!=NULL) {
4571                        *offsets++=sourceIndex;
4572                    }
4573                default:
4574                    /* will never occur */
4575                    break;
4576                }
4577
4578                /* target overflow */
4579                targetCapacity=0;
4580                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4581                c=0;
4582                break;
4583            }
4584
4585            /* normal end of conversion: prepare for a new character */
4586            c=0;
4587            if(offsets!=NULL) {
4588                prevSourceIndex=sourceIndex;
4589                sourceIndex=nextSourceIndex;
4590            }
4591            continue;
4592        } else {
4593            /* target is full */
4594            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4595            break;
4596        }
4597    }
4598
4599    /*
4600     * the end of the input stream and detection of truncated input
4601     * are handled by the framework, but for EBCDIC_STATEFUL conversion
4602     * we need to emit an SI at the very end
4603     *
4604     * conditions:
4605     *   successful
4606     *   EBCDIC_STATEFUL in DBCS mode
4607     *   end of input and no truncated input
4608     */
4609    if( U_SUCCESS(*pErrorCode) &&
4610        outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
4611        pArgs->flush && source>=sourceLimit && c==0
4612    ) {
4613        /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
4614        if(targetCapacity>0) {
4615            *target++=(uint8_t)si_value[0];
4616            if (si_value_length == 2) {
4617                if (targetCapacity<2) {
4618                    cnv->charErrorBuffer[0]=(uint8_t)si_value[1];
4619                    cnv->charErrorBufferLength=1;
4620                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4621                } else {
4622                    *target++=(uint8_t)si_value[1];
4623                }
4624            }
4625            if(offsets!=NULL) {
4626                /* set the last source character's index (sourceIndex points at sourceLimit now) */
4627                *offsets++=prevSourceIndex;
4628            }
4629        } else {
4630            /* target is full */
4631            cnv->charErrorBuffer[0]=(uint8_t)si_value[0];
4632            if (si_value_length == 2) {
4633                cnv->charErrorBuffer[1]=(uint8_t)si_value[1];
4634            }
4635            cnv->charErrorBufferLength=si_value_length;
4636            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4637        }
4638        prevLength=1; /* we switched into SBCS */
4639    }
4640
4641    /* set the converter state back into UConverter */
4642    cnv->fromUChar32=c;
4643    cnv->fromUnicodeStatus=prevLength;
4644
4645    /* write back the updated pointers */
4646    pArgs->source=source;
4647    pArgs->target=(char *)target;
4648    pArgs->offsets=offsets;
4649}
4650
4651/*
4652 * This is another simple conversion function for internal use by other
4653 * conversion implementations.
4654 * It does not use the converter state nor call callbacks.
4655 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
4656 * It handles conversion extensions but not GB 18030.
4657 *
4658 * It converts one single Unicode code point into codepage bytes, encoded
4659 * as one 32-bit value. The function returns the number of bytes in *pValue:
4660 * 1..4 the number of bytes in *pValue
4661 * 0    unassigned (*pValue undefined)
4662 * -1   illegal (currently not used, *pValue undefined)
4663 *
4664 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
4665 * the second to last byte in bits 15..8, etc.
4666 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
4667 */
4668U_CFUNC int32_t
4669ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
4670                 UChar32 c, uint32_t *pValue,
4671                 UBool useFallback) {
4672    const int32_t *cx;
4673    const uint16_t *table;
4674#if 0
4675/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4676    const uint8_t *p;
4677#endif
4678    uint32_t stage2Entry;
4679    uint32_t value;
4680    int32_t length;
4681
4682    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4683    if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4684        table=sharedData->mbcs.fromUnicodeTable;
4685
4686        /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
4687        if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
4688            value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
4689            /* is this code point assigned, or do we use fallbacks? */
4690            if(useFallback ? value>=0x800 : value>=0xc00) {
4691                *pValue=value&0xff;
4692                return 1;
4693            }
4694        } else /* outputType!=MBCS_OUTPUT_1 */ {
4695            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4696
4697            /* get the bytes and the length for the output */
4698            switch(sharedData->mbcs.outputType) {
4699            case MBCS_OUTPUT_2:
4700                value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4701                if(value<=0xff) {
4702                    length=1;
4703                } else {
4704                    length=2;
4705                }
4706                break;
4707#if 0
4708/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4709            case MBCS_OUTPUT_DBCS_ONLY:
4710                /* table with single-byte results, but only DBCS mappings used */
4711                value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4712                if(value<=0xff) {
4713                    /* no mapping or SBCS result, not taken for DBCS-only */
4714                    value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4715                    length=0;
4716                } else {
4717                    length=2;
4718                }
4719                break;
4720            case MBCS_OUTPUT_3:
4721                p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4722                value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4723                if(value<=0xff) {
4724                    length=1;
4725                } else if(value<=0xffff) {
4726                    length=2;
4727                } else {
4728                    length=3;
4729                }
4730                break;
4731            case MBCS_OUTPUT_4:
4732                value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4733                if(value<=0xff) {
4734                    length=1;
4735                } else if(value<=0xffff) {
4736                    length=2;
4737                } else if(value<=0xffffff) {
4738                    length=3;
4739                } else {
4740                    length=4;
4741                }
4742                break;
4743            case MBCS_OUTPUT_3_EUC:
4744                value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4745                /* EUC 16-bit fixed-length representation */
4746                if(value<=0xff) {
4747                    length=1;
4748                } else if((value&0x8000)==0) {
4749                    value|=0x8e8000;
4750                    length=3;
4751                } else if((value&0x80)==0) {
4752                    value|=0x8f0080;
4753                    length=3;
4754                } else {
4755                    length=2;
4756                }
4757                break;
4758            case MBCS_OUTPUT_4_EUC:
4759                p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4760                value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4761                /* EUC 16-bit fixed-length representation applied to the first two bytes */
4762                if(value<=0xff) {
4763                    length=1;
4764                } else if(value<=0xffff) {
4765                    length=2;
4766                } else if((value&0x800000)==0) {
4767                    value|=0x8e800000;
4768                    length=4;
4769                } else if((value&0x8000)==0) {
4770                    value|=0x8f008000;
4771                    length=4;
4772                } else {
4773                    length=3;
4774                }
4775                break;
4776#endif
4777            default:
4778                /* must not occur */
4779                return -1;
4780            }
4781
4782            /* is this code point assigned, or do we use fallbacks? */
4783            if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
4784                (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
4785            ) {
4786                /*
4787                 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4788                 * There is no way with this data structure for fallback output
4789                 * to be a zero byte.
4790                 */
4791                /* assigned */
4792                *pValue=value;
4793                return length;
4794            }
4795        }
4796    }
4797
4798    cx=sharedData->mbcs.extIndexes;
4799    if(cx!=NULL) {
4800        length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
4801        return length>=0 ? length : -length;  /* return abs(length); */
4802    }
4803
4804    /* unassigned */
4805    return 0;
4806}
4807
4808
#if 0
/*
 * The live version of this function is in ucnv2022.c so that it can be inlined.
 * This disabled copy is kept here only for documentation purposes.
 */

/**
 * Variant of ucnv_MBCSFromUChar32() that is optimized for single-byte codepages.
 * The EBCDIC swaplfnl option (set in UConverter) is not handled.
 * Conversion extensions (_extFromU()) are not handled.
 *
 * @return the codepage byte for the code point, or -1 if it is unassigned
 */
U_CFUNC int32_t
ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                       UChar32 c,
                       UBool useFallback) {
    const uint16_t *table;
    int32_t result, minResult;

    /* a BMP-only codepage has no stage 1 entries for supplementary code points */
    if(c>0xffff && (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)==0) {
        return -1;
    }

    /* look up c in the from-Unicode trie (same as in _MBCSFromUnicodeWithOffsets) */
    table=sharedData->mbcs.fromUnicodeTable;
    result=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);

    /* results at or above the threshold are usable; the low byte is the output */
    minResult= useFallback ? 0x800 : 0xc00;
    return result>=minResult ? (result&0xff) : -1;
}
#endif
4847
4848/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
4849
4850/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
4851static const UChar32
4852utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
4853
4854/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
4855static const UChar32
4856utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
4857
4858static void
4859ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
4860                  UConverterToUnicodeArgs *pToUArgs,
4861                  UErrorCode *pErrorCode) {
4862    UConverter *utf8, *cnv;
4863    const uint8_t *source, *sourceLimit;
4864    uint8_t *target;
4865    int32_t targetCapacity;
4866
4867    const uint16_t *table, *sbcsIndex;
4868    const uint16_t *results;
4869
4870    int8_t oldToULength, toULength, toULimit;
4871
4872    UChar32 c;
4873    uint8_t b, t1, t2;
4874
4875    uint32_t asciiRoundtrips;
4876    uint16_t value, minValue;
4877    UBool hasSupplementary;
4878
4879    /* set up the local pointers */
4880    utf8=pToUArgs->converter;
4881    cnv=pFromUArgs->converter;
4882    source=(uint8_t *)pToUArgs->source;
4883    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
4884    target=(uint8_t *)pFromUArgs->target;
4885    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
4886
4887    table=cnv->sharedData->mbcs.fromUnicodeTable;
4888    sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
4889    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
4890        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
4891    } else {
4892        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
4893    }
4894    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
4895
4896    if(cnv->useFallback) {
4897        /* use all roundtrip and fallback results */
4898        minValue=0x800;
4899    } else {
4900        /* use only roundtrips and fallbacks from private-use characters */
4901        minValue=0xc00;
4902    }
4903    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
4904
4905    /* get the converter state from the UTF-8 UConverter */
4906    c=(UChar32)utf8->toUnicodeStatus;
4907    if(c!=0) {
4908        toULength=oldToULength=utf8->toULength;
4909        toULimit=(int8_t)utf8->mode;
4910    } else {
4911        toULength=oldToULength=toULimit=0;
4912    }
4913
4914    /*
4915     * Make sure that the last byte sequence before sourceLimit is complete
4916     * or runs into a lead byte.
4917     * Do not go back into the bytes that will be read for finishing a partial
4918     * sequence from the previous buffer.
4919     * In the conversion loop compare source with sourceLimit only once
4920     * per multi-byte character.
4921     */
4922    {
4923        int32_t i, length;
4924
4925        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
4926        for(i=0; i<3 && i<length;) {
4927            b=*(sourceLimit-i-1);
4928            if(U8_IS_TRAIL(b)) {
4929                ++i;
4930            } else {
4931                if(i<utf8_countTrailBytes[b]) {
4932                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
4933                    sourceLimit-=i+1;
4934                }
4935                break;
4936            }
4937        }
4938    }
4939
4940    if(c!=0 && targetCapacity>0) {
4941        utf8->toUnicodeStatus=0;
4942        utf8->toULength=0;
4943        goto moreBytes;
4944        /*
4945         * Note: We could avoid the goto by duplicating some of the moreBytes
4946         * code, but only up to the point of collecting a complete UTF-8
4947         * sequence; then recurse for the toUBytes[toULength]
4948         * and then continue with normal conversion.
4949         *
4950         * If so, move this code to just after initializing the minimum
4951         * set of local variables for reading the UTF-8 input
4952         * (utf8, source, target, limits but not cnv, table, minValue, etc.).
4953         *
4954         * Potential advantages:
4955         * - avoid the goto
4956         * - oldToULength could become a local variable in just those code blocks
4957         *   that deal with buffer boundaries
4958         * - possibly faster if the goto prevents some compiler optimizations
4959         *   (this would need measuring to confirm)
4960         * Disadvantage:
4961         * - code duplication
4962         */
4963    }
4964
4965    /* conversion loop */
4966    while(source<sourceLimit) {
4967        if(targetCapacity>0) {
4968            b=*source++;
4969            if((int8_t)b>=0) {
4970                /* convert ASCII */
4971                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
4972                    *target++=(uint8_t)b;
4973                    --targetCapacity;
4974                    continue;
4975                } else {
4976                    c=b;
4977                    value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
4978                }
4979            } else {
4980                if(b<0xe0) {
4981                    if( /* handle U+0080..U+07FF inline */
4982                        b>=0xc2 &&
4983                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
4984                    ) {
4985                        c=b&0x1f;
4986                        ++source;
4987                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
4988                        if(value>=minValue) {
4989                            *target++=(uint8_t)value;
4990                            --targetCapacity;
4991                            continue;
4992                        } else {
4993                            c=(c<<6)|t1;
4994                        }
4995                    } else {
4996                        c=-1;
4997                    }
4998                } else if(b==0xe0) {
4999                    if( /* handle U+0800..U+0FFF inline */
5000                        (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
5001                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
5002                    ) {
5003                        c=t1;
5004                        source+=2;
5005                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
5006                        if(value>=minValue) {
5007                            *target++=(uint8_t)value;
5008                            --targetCapacity;
5009                            continue;
5010                        } else {
5011                            c=(c<<6)|t2;
5012                        }
5013                    } else {
5014                        c=-1;
5015                    }
5016                } else {
5017                    c=-1;
5018                }
5019
5020                if(c<0) {
5021                    /* handle "complicated" and error cases, and continuing partial characters */
5022                    oldToULength=0;
5023                    toULength=1;
5024                    toULimit=utf8_countTrailBytes[b]+1;
5025                    c=b;
5026moreBytes:
5027                    while(toULength<toULimit) {
5028                        /*
5029                         * The sourceLimit may have been adjusted before the conversion loop
5030                         * to stop before a truncated sequence.
5031                         * Here we need to use the real limit in case we have two truncated
5032                         * sequences at the end.
5033                         * See ticket #7492.
5034                         */
5035                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
5036                            b=*source;
5037                            if(U8_IS_TRAIL(b)) {
5038                                ++source;
5039                                ++toULength;
5040                                c=(c<<6)+b;
5041                            } else {
5042                                break; /* sequence too short, stop with toULength<toULimit */
5043                            }
5044                        } else {
5045                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
5046                            source-=(toULength-oldToULength);
5047                            while(oldToULength<toULength) {
5048                                utf8->toUBytes[oldToULength++]=*source++;
5049                            }
5050                            utf8->toUnicodeStatus=c;
5051                            utf8->toULength=toULength;
5052                            utf8->mode=toULimit;
5053                            pToUArgs->source=(char *)source;
5054                            pFromUArgs->target=(char *)target;
5055                            return;
5056                        }
5057                    }
5058
5059                    if( toULength==toULimit &&      /* consumed all trail bytes */
5060                        (toULength==3 || toULength==2) &&             /* BMP */
5061                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
5062                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
5063                    ) {
5064                        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
5065                    } else if(
5066                        toULength==toULimit && toULength==4 &&
5067                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
5068                    ) {
5069                        /* supplementary code point */
5070                        if(!hasSupplementary) {
5071                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5072                            value=0;
5073                        } else {
5074                            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
5075                        }
5076                    } else {
5077                        /* error handling: illegal UTF-8 byte sequence */
5078                        source-=(toULength-oldToULength);
5079                        while(oldToULength<toULength) {
5080                            utf8->toUBytes[oldToULength++]=*source++;
5081                        }
5082                        utf8->toULength=toULength;
5083                        pToUArgs->source=(char *)source;
5084                        pFromUArgs->target=(char *)target;
5085                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
5086                        return;
5087                    }
5088                }
5089            }
5090
5091            if(value>=minValue) {
5092                /* output the mapping for c */
5093                *target++=(uint8_t)value;
5094                --targetCapacity;
5095            } else {
5096                /* value<minValue means c is unassigned (unmappable) */
5097                /*
5098                 * Try an extension mapping.
5099                 * Pass in no source because we don't have UTF-16 input.
5100                 * If we have a partial match on c, we will return and revert
5101                 * to UTF-8->UTF-16->charset conversion.
5102                 */
5103                static const UChar nul=0;
5104                const UChar *noSource=&nul;
5105                c=_extFromU(cnv, cnv->sharedData,
5106                            c, &noSource, noSource,
5107                            &target, target+targetCapacity,
5108                            NULL, -1,
5109                            pFromUArgs->flush,
5110                            pErrorCode);
5111
5112                if(U_FAILURE(*pErrorCode)) {
5113                    /* not mappable or buffer overflow */
5114                    cnv->fromUChar32=c;
5115                    break;
5116                } else if(cnv->preFromUFirstCP>=0) {
5117                    /*
5118                     * Partial match, return and revert to pivoting.
5119                     * In normal from-UTF-16 conversion, we would just continue
5120                     * but then exit the loop because the extension match would
5121                     * have consumed the source.
5122                     */
5123                    break;
5124                } else {
5125                    /* a mapping was written to the target, continue */
5126
5127                    /* recalculate the targetCapacity after an extension mapping */
5128                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5129                }
5130            }
5131        } else {
5132            /* target is full */
5133            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5134            break;
5135        }
5136    }
5137
5138    /*
5139     * The sourceLimit may have been adjusted before the conversion loop
5140     * to stop before a truncated sequence.
5141     * If so, then collect the truncated sequence now.
5142     */
5143    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5144        c=utf8->toUBytes[0]=b=*source++;
5145        toULength=1;
5146        toULimit=utf8_countTrailBytes[b]+1;
5147        while(source<sourceLimit) {
5148            utf8->toUBytes[toULength++]=b=*source++;
5149            c=(c<<6)+b;
5150        }
5151        utf8->toUnicodeStatus=c;
5152        utf8->toULength=toULength;
5153        utf8->mode=toULimit;
5154    }
5155
5156    /* write back the updated pointers */
5157    pToUArgs->source=(char *)source;
5158    pFromUArgs->target=(char *)target;
5159}
5160
5161static void
5162ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
5163                  UConverterToUnicodeArgs *pToUArgs,
5164                  UErrorCode *pErrorCode) {
5165    UConverter *utf8, *cnv;
5166    const uint8_t *source, *sourceLimit;
5167    uint8_t *target;
5168    int32_t targetCapacity;
5169
5170    const uint16_t *table, *mbcsIndex;
5171    const uint16_t *results;
5172
5173    int8_t oldToULength, toULength, toULimit;
5174
5175    UChar32 c;
5176    uint8_t b, t1, t2;
5177
5178    uint32_t stage2Entry;
5179    uint32_t asciiRoundtrips;
5180    uint16_t value, minValue;
5181    UBool hasSupplementary;
5182
5183    /* set up the local pointers */
5184    utf8=pToUArgs->converter;
5185    cnv=pFromUArgs->converter;
5186    source=(uint8_t *)pToUArgs->source;
5187    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
5188    target=(uint8_t *)pFromUArgs->target;
5189    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
5190
5191    table=cnv->sharedData->mbcs.fromUnicodeTable;
5192    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
5193    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
5194        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
5195    } else {
5196        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
5197    }
5198    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
5199
5200    if(cnv->useFallback) {
5201        /* use all roundtrip and fallback results */
5202        minValue=0x800;
5203    } else {
5204        /* use only roundtrips and fallbacks from private-use characters */
5205        minValue=0xc00;
5206    }
5207    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
5208
5209    /* get the converter state from the UTF-8 UConverter */
5210    c=(UChar32)utf8->toUnicodeStatus;
5211    if(c!=0) {
5212        toULength=oldToULength=utf8->toULength;
5213        toULimit=(int8_t)utf8->mode;
5214    } else {
5215        toULength=oldToULength=toULimit=0;
5216    }
5217
5218    /*
5219     * Make sure that the last byte sequence before sourceLimit is complete
5220     * or runs into a lead byte.
5221     * Do not go back into the bytes that will be read for finishing a partial
5222     * sequence from the previous buffer.
5223     * In the conversion loop compare source with sourceLimit only once
5224     * per multi-byte character.
5225     */
5226    {
5227        int32_t i, length;
5228
5229        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
5230        for(i=0; i<3 && i<length;) {
5231            b=*(sourceLimit-i-1);
5232            if(U8_IS_TRAIL(b)) {
5233                ++i;
5234            } else {
5235                if(i<utf8_countTrailBytes[b]) {
5236                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
5237                    sourceLimit-=i+1;
5238                }
5239                break;
5240            }
5241        }
5242    }
5243
5244    if(c!=0 && targetCapacity>0) {
5245        utf8->toUnicodeStatus=0;
5246        utf8->toULength=0;
5247        goto moreBytes;
5248        /* See note in ucnv_SBCSFromUTF8() about this goto. */
5249    }
5250
5251    /* conversion loop */
5252    while(source<sourceLimit) {
5253        if(targetCapacity>0) {
5254            b=*source++;
5255            if((int8_t)b>=0) {
5256                /* convert ASCII */
5257                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
5258                    *target++=b;
5259                    --targetCapacity;
5260                    continue;
5261                } else {
5262                    value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
5263                    if(value==0) {
5264                        c=b;
5265                        goto unassigned;
5266                    }
5267                }
5268            } else {
5269                if(b>0xe0) {
5270                    if( /* handle U+1000..U+D7FF inline */
5271                        (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
5272                                                        (b==0xed && (t1 <= 0x1f))) &&
5273                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
5274                    ) {
5275                        c=((b&0xf)<<6)|t1;
5276                        source+=2;
5277                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
5278                        if(value==0) {
5279                            c=(c<<6)|t2;
5280                            goto unassigned;
5281                        }
5282                    } else {
5283                        c=-1;
5284                    }
5285                } else if(b<0xe0) {
5286                    if( /* handle U+0080..U+07FF inline */
5287                        b>=0xc2 &&
5288                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
5289                    ) {
5290                        c=b&0x1f;
5291                        ++source;
5292                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
5293                        if(value==0) {
5294                            c=(c<<6)|t1;
5295                            goto unassigned;
5296                        }
5297                    } else {
5298                        c=-1;
5299                    }
5300                } else {
5301                    c=-1;
5302                }
5303
5304                if(c<0) {
5305                    /* handle "complicated" and error cases, and continuing partial characters */
5306                    oldToULength=0;
5307                    toULength=1;
5308                    toULimit=utf8_countTrailBytes[b]+1;
5309                    c=b;
5310moreBytes:
5311                    while(toULength<toULimit) {
5312                        /*
5313                         * The sourceLimit may have been adjusted before the conversion loop
5314                         * to stop before a truncated sequence.
5315                         * Here we need to use the real limit in case we have two truncated
5316                         * sequences at the end.
5317                         * See ticket #7492.
5318                         */
5319                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
5320                            b=*source;
5321                            if(U8_IS_TRAIL(b)) {
5322                                ++source;
5323                                ++toULength;
5324                                c=(c<<6)+b;
5325                            } else {
5326                                break; /* sequence too short, stop with toULength<toULimit */
5327                            }
5328                        } else {
5329                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
5330                            source-=(toULength-oldToULength);
5331                            while(oldToULength<toULength) {
5332                                utf8->toUBytes[oldToULength++]=*source++;
5333                            }
5334                            utf8->toUnicodeStatus=c;
5335                            utf8->toULength=toULength;
5336                            utf8->mode=toULimit;
5337                            pToUArgs->source=(char *)source;
5338                            pFromUArgs->target=(char *)target;
5339                            return;
5340                        }
5341                    }
5342
5343                    if( toULength==toULimit &&      /* consumed all trail bytes */
5344                        (toULength==3 || toULength==2) &&             /* BMP */
5345                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
5346                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
5347                    ) {
5348                        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5349                    } else if(
5350                        toULength==toULimit && toULength==4 &&
5351                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
5352                    ) {
5353                        /* supplementary code point */
5354                        if(!hasSupplementary) {
5355                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5356                            stage2Entry=0;
5357                        } else {
5358                            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5359                        }
5360                    } else {
5361                        /* error handling: illegal UTF-8 byte sequence */
5362                        source-=(toULength-oldToULength);
5363                        while(oldToULength<toULength) {
5364                            utf8->toUBytes[oldToULength++]=*source++;
5365                        }
5366                        utf8->toULength=toULength;
5367                        pToUArgs->source=(char *)source;
5368                        pFromUArgs->target=(char *)target;
5369                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
5370                        return;
5371                    }
5372
5373                    /* get the bytes and the length for the output */
5374                    /* MBCS_OUTPUT_2 */
5375                    value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
5376
5377                    /* is this code point assigned, or do we use fallbacks? */
5378                    if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
5379                         (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
5380                    ) {
5381                        goto unassigned;
5382                    }
5383                }
5384            }
5385
5386            /* write the output character bytes from value and length */
5387            /* from the first if in the loop we know that targetCapacity>0 */
5388            if(value<=0xff) {
5389                /* this is easy because we know that there is enough space */
5390                *target++=(uint8_t)value;
5391                --targetCapacity;
5392            } else /* length==2 */ {
5393                *target++=(uint8_t)(value>>8);
5394                if(2<=targetCapacity) {
5395                    *target++=(uint8_t)value;
5396                    targetCapacity-=2;
5397                } else {
5398                    cnv->charErrorBuffer[0]=(char)value;
5399                    cnv->charErrorBufferLength=1;
5400
5401                    /* target overflow */
5402                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5403                    break;
5404                }
5405            }
5406            continue;
5407
5408unassigned:
5409            {
5410                /*
5411                 * Try an extension mapping.
5412                 * Pass in no source because we don't have UTF-16 input.
5413                 * If we have a partial match on c, we will return and revert
5414                 * to UTF-8->UTF-16->charset conversion.
5415                 */
5416                static const UChar nul=0;
5417                const UChar *noSource=&nul;
5418                c=_extFromU(cnv, cnv->sharedData,
5419                            c, &noSource, noSource,
5420                            &target, target+targetCapacity,
5421                            NULL, -1,
5422                            pFromUArgs->flush,
5423                            pErrorCode);
5424
5425                if(U_FAILURE(*pErrorCode)) {
5426                    /* not mappable or buffer overflow */
5427                    cnv->fromUChar32=c;
5428                    break;
5429                } else if(cnv->preFromUFirstCP>=0) {
5430                    /*
5431                     * Partial match, return and revert to pivoting.
5432                     * In normal from-UTF-16 conversion, we would just continue
5433                     * but then exit the loop because the extension match would
5434                     * have consumed the source.
5435                     */
5436                    break;
5437                } else {
5438                    /* a mapping was written to the target, continue */
5439
5440                    /* recalculate the targetCapacity after an extension mapping */
5441                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5442                    continue;
5443                }
5444            }
5445        } else {
5446            /* target is full */
5447            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5448            break;
5449        }
5450    }
5451
5452    /*
5453     * The sourceLimit may have been adjusted before the conversion loop
5454     * to stop before a truncated sequence.
5455     * If so, then collect the truncated sequence now.
5456     */
5457    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5458        c=utf8->toUBytes[0]=b=*source++;
5459        toULength=1;
5460        toULimit=utf8_countTrailBytes[b]+1;
5461        while(source<sourceLimit) {
5462            utf8->toUBytes[toULength++]=b=*source++;
5463            c=(c<<6)+b;
5464        }
5465        utf8->toUnicodeStatus=c;
5466        utf8->toULength=toULength;
5467        utf8->mode=toULimit;
5468    }
5469
5470    /* write back the updated pointers */
5471    pToUArgs->source=(char *)source;
5472    pFromUArgs->target=(char *)target;
5473}
5474
5475/* miscellaneous ------------------------------------------------------------ */
5476
5477static void
5478ucnv_MBCSGetStarters(const UConverter* cnv,
5479                 UBool starters[256],
5480                 UErrorCode *pErrorCode) {
5481    const int32_t *state0;
5482    int i;
5483
5484    state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
5485    for(i=0; i<256; ++i) {
5486        /* all bytes that cause a state transition from state 0 are lead bytes */
5487        starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
5488    }
5489}
5490
5491/*
5492 * This is an internal function that allows other converter implementations
5493 * to check whether a byte is a lead byte.
5494 */
5495U_CFUNC UBool
5496ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
5497    return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
5498}
5499
5500static void
5501ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
5502              int32_t offsetIndex,
5503              UErrorCode *pErrorCode) {
5504    UConverter *cnv=pArgs->converter;
5505    char *p, *subchar;
5506    char buffer[4];
5507    int32_t length;
5508
5509    /* first, select between subChar and subChar1 */
5510    if( cnv->subChar1!=0 &&
5511        (cnv->sharedData->mbcs.extIndexes!=NULL ?
5512            cnv->useSubChar1 :
5513            (cnv->invalidUCharBuffer[0]<=0xff))
5514    ) {
5515        /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
5516        subchar=(char *)&cnv->subChar1;
5517        length=1;
5518    } else {
5519        /* select subChar in all other cases */
5520        subchar=(char *)cnv->subChars;
5521        length=cnv->subCharLen;
5522    }
5523
5524    /* reset the selector for the next code point */
5525    cnv->useSubChar1=FALSE;
5526
5527    if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
5528        p=buffer;
5529
5530        /* fromUnicodeStatus contains prevLength */
5531        switch(length) {
5532        case 1:
5533            if(cnv->fromUnicodeStatus==2) {
5534                /* DBCS mode and SBCS sub char: change to SBCS */
5535                cnv->fromUnicodeStatus=1;
5536                *p++=UCNV_SI;
5537            }
5538            *p++=subchar[0];
5539            break;
5540        case 2:
5541            if(cnv->fromUnicodeStatus<=1) {
5542                /* SBCS mode and DBCS sub char: change to DBCS */
5543                cnv->fromUnicodeStatus=2;
5544                *p++=UCNV_SO;
5545            }
5546            *p++=subchar[0];
5547            *p++=subchar[1];
5548            break;
5549        default:
5550            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
5551            return;
5552        }
5553        subchar=buffer;
5554        length=(int32_t)(p-buffer);
5555    }
5556
5557    ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
5558}
5559
5560U_CFUNC UConverterType
5561ucnv_MBCSGetType(const UConverter* converter) {
5562    /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
5563    if(converter->sharedData->mbcs.countStates==1) {
5564        return (UConverterType)UCNV_SBCS;
5565    } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
5566        return (UConverterType)UCNV_EBCDIC_STATEFUL;
5567    } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
5568        return (UConverterType)UCNV_DBCS;
5569    }
5570    return (UConverterType)UCNV_MBCS;
5571}
5572
5573static const UConverterImpl _SBCSUTF8Impl={
5574    UCNV_MBCS,
5575
5576    ucnv_MBCSLoad,
5577    ucnv_MBCSUnload,
5578
5579    ucnv_MBCSOpen,
5580    NULL,
5581    NULL,
5582
5583    ucnv_MBCSToUnicodeWithOffsets,
5584    ucnv_MBCSToUnicodeWithOffsets,
5585    ucnv_MBCSFromUnicodeWithOffsets,
5586    ucnv_MBCSFromUnicodeWithOffsets,
5587    ucnv_MBCSGetNextUChar,
5588
5589    ucnv_MBCSGetStarters,
5590    ucnv_MBCSGetName,
5591    ucnv_MBCSWriteSub,
5592    NULL,
5593    ucnv_MBCSGetUnicodeSet,
5594
5595    NULL,
5596    ucnv_SBCSFromUTF8
5597};
5598
5599static const UConverterImpl _DBCSUTF8Impl={
5600    UCNV_MBCS,
5601
5602    ucnv_MBCSLoad,
5603    ucnv_MBCSUnload,
5604
5605    ucnv_MBCSOpen,
5606    NULL,
5607    NULL,
5608
5609    ucnv_MBCSToUnicodeWithOffsets,
5610    ucnv_MBCSToUnicodeWithOffsets,
5611    ucnv_MBCSFromUnicodeWithOffsets,
5612    ucnv_MBCSFromUnicodeWithOffsets,
5613    ucnv_MBCSGetNextUChar,
5614
5615    ucnv_MBCSGetStarters,
5616    ucnv_MBCSGetName,
5617    ucnv_MBCSWriteSub,
5618    NULL,
5619    ucnv_MBCSGetUnicodeSet,
5620
5621    NULL,
5622    ucnv_DBCSFromUTF8
5623};
5624
5625static const UConverterImpl _MBCSImpl={
5626    UCNV_MBCS,
5627
5628    ucnv_MBCSLoad,
5629    ucnv_MBCSUnload,
5630
5631    ucnv_MBCSOpen,
5632    NULL,
5633    NULL,
5634
5635    ucnv_MBCSToUnicodeWithOffsets,
5636    ucnv_MBCSToUnicodeWithOffsets,
5637    ucnv_MBCSFromUnicodeWithOffsets,
5638    ucnv_MBCSFromUnicodeWithOffsets,
5639    ucnv_MBCSGetNextUChar,
5640
5641    ucnv_MBCSGetStarters,
5642    ucnv_MBCSGetName,
5643    ucnv_MBCSWriteSub,
5644    NULL,
5645    ucnv_MBCSGetUnicodeSet
5646};
5647
5648
5649/* Static data is in tools/makeconv/ucnvstat.c for data-based
5650 * converters. Be sure to update it as well.
5651 */
5652
5653const UConverterSharedData _MBCSData={
5654    sizeof(UConverterSharedData), 1,
5655    NULL, NULL, NULL, FALSE, &_MBCSImpl,
5656    0
5657};
5658
5659#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
5660