1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6*   Copyright (C) 2003-2016, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9******************************************************************************
10*   file name:  ucnv_ext.cpp
11*   encoding:   UTF-8
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2003jun13
16*   created by: Markus W. Scherer
17*
18*   Conversion extensions
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
24
25#include "unicode/uset.h"
26#include "unicode/ustring.h"
27#include "ucnv_bld.h"
28#include "ucnv_cnv.h"
29#include "ucnv_ext.h"
30#include "cmemory.h"
31#include "uassert.h"
32
33/* to Unicode --------------------------------------------------------------- */
34
35/*
36 * @return lookup value for the byte, if found; else 0
37 */
38static inline uint32_t
39ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
40    uint32_t word0, word;
41    int32_t i, start, limit;
42
43    /* check the input byte against the lowest and highest section bytes */
44    start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]);
45    limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]);
46    if(byte<start || limit<byte) {
47        return 0; /* the byte is out of range */
48    }
49
50    if(length==((limit-start)+1)) {
51        /* direct access on a linear array */
52        return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */
53    }
54
55    /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
56    word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0);
57
58    /*
59     * Shift byte once instead of each section word and add 0xffffff.
60     * We will compare the shifted/added byte (bbffffff) against
61     * section words which have byte values in the same bit position.
62     * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv
63     * for all v=0..f
64     * so we need not mask off the lower 24 bits of each section word.
65     */
66    word=word0|UCNV_EXT_TO_U_VALUE_MASK;
67
68    /* binary search */
69    start=0;
70    limit=length;
71    for(;;) {
72        i=limit-start;
73        if(i<=1) {
74            break; /* done */
75        }
76        /* start<limit-1 */
77
78        if(i<=4) {
79            /* linear search for the last part */
80            if(word0<=toUSection[start]) {
81                break;
82            }
83            if(++start<limit && word0<=toUSection[start]) {
84                break;
85            }
86            if(++start<limit && word0<=toUSection[start]) {
87                break;
88            }
89            /* always break at start==limit-1 */
90            ++start;
91            break;
92        }
93
94        i=(start+limit)/2;
95        if(word<toUSection[i]) {
96            limit=i;
97        } else {
98            start=i;
99        }
100    }
101
102    /* did we really find it? */
103    if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) {
104        return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */
105    } else {
106        return 0; /* not found */
107    }
108}
109
/*
 * TRUE if not an SI/SO stateful converter (sisoState<0),
 * or if the match length fits with the current converter state:
 * a match length of exactly 1 in state 0, any other match length otherwise.
 *
 * Both parameters are fully parenthesized so that arbitrary expressions
 * (for example i+j, or expressions using operators that bind more loosely
 * than ==) can be passed safely.
 */
#define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
    ((sisoState)<0 || ((sisoState)==0) == ((match)==1))
116
117/*
118 * this works like ucnv_extMatchFromU() except
119 * - the first character is in pre
120 * - no trie is used
121 * - the returned matchLength is not offset by 2
122 */
123static int32_t
124ucnv_extMatchToU(const int32_t *cx, int8_t sisoState,
125                 const char *pre, int32_t preLength,
126                 const char *src, int32_t srcLength,
127                 uint32_t *pMatchValue,
128                 UBool /*useFallback*/, UBool flush) {
129    const uint32_t *toUTable, *toUSection;
130
131    uint32_t value, matchValue;
132    int32_t i, j, idx, length, matchLength;
133    uint8_t b;
134
135    if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) {
136        return 0; /* no extension data, no match */
137    }
138
139    /* initialize */
140    toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
141    idx=0;
142
143    matchValue=0;
144    i=j=matchLength=0;
145
146    if(sisoState==0) {
147        /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
148        if(preLength>1) {
149            return 0; /* no match of a DBCS sequence in SBCS mode */
150        } else if(preLength==1) {
151            srcLength=0;
152        } else /* preLength==0 */ {
153            if(srcLength>1) {
154                srcLength=1;
155            }
156        }
157        flush=TRUE;
158    }
159
160    /* we must not remember fallback matches when not using fallbacks */
161
162    /* match input units until there is a full match or the input is consumed */
163    for(;;) {
164        /* go to the next section */
165        toUSection=toUTable+idx;
166
167        /* read first pair of the section */
168        value=*toUSection++;
169        length=UCNV_EXT_TO_U_GET_BYTE(value);
170        value=UCNV_EXT_TO_U_GET_VALUE(value);
171        if( value!=0 &&
172            (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
173             TO_U_USE_FALLBACK(useFallback)) &&
174            UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
175        ) {
176            /* remember longest match so far */
177            matchValue=value;
178            matchLength=i+j;
179        }
180
181        /* match pre[] then src[] */
182        if(i<preLength) {
183            b=(uint8_t)pre[i++];
184        } else if(j<srcLength) {
185            b=(uint8_t)src[j++];
186        } else {
187            /* all input consumed, partial match */
188            if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) {
189                /*
190                 * end of the entire input stream, stop with the longest match so far
191                 * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
192                 * because it must fit into state buffers
193                 */
194                break;
195            } else {
196                /* continue with more input next time */
197                return -length;
198            }
199        }
200
201        /* search for the current UChar */
202        value=ucnv_extFindToU(toUSection, length, b);
203        if(value==0) {
204            /* no match here, stop with the longest match so far */
205            break;
206        } else {
207            if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
208                /* partial match, continue */
209                idx=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
210            } else {
211                if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
212                     TO_U_USE_FALLBACK(useFallback)) &&
213                    UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
214                ) {
215                    /* full match, stop with result */
216                    matchValue=value;
217                    matchLength=i+j;
218                } else {
219                    /* full match on fallback not taken, stop with the longest match so far */
220                }
221                break;
222            }
223        }
224    }
225
226    if(matchLength==0) {
227        /* no match at all */
228        return 0;
229    }
230
231    /* return result */
232    *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
233    return matchLength;
234}
235
236static inline void
237ucnv_extWriteToU(UConverter *cnv, const int32_t *cx,
238                 uint32_t value,
239                 UChar **target, const UChar *targetLimit,
240                 int32_t **offsets, int32_t srcIndex,
241                 UErrorCode *pErrorCode) {
242    /* output the result */
243    if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
244        /* output a single code point */
245        ucnv_toUWriteCodePoint(
246            cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value),
247            target, targetLimit,
248            offsets, srcIndex,
249            pErrorCode);
250    } else {
251        /* output a string - with correct data we have resultLength>0 */
252        ucnv_toUWriteUChars(
253            cnv,
254            UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+
255                UCNV_EXT_TO_U_GET_INDEX(value),
256            UCNV_EXT_TO_U_GET_LENGTH(value),
257            target, targetLimit,
258            offsets, srcIndex,
259            pErrorCode);
260    }
261}
262
/*
 * Yields the SI/SO toU state of a converter:
 * - for MBCS_OUTPUT_2_SISO, the converter mode (state 0 is for SBCS, 1 for DBCS)
 * - for MBCS_OUTPUT_DBCS_ONLY, always 1
 * - otherwise -1 (the converter is not SI/SO stateful)
 *
 * Note: For SI/SO stateful converters getting here,
 * cnv->mode==0 is equivalent to firstLength==1.
 *
 * The argument is evaluated more than once.
 */
#define UCNV_SISO_STATE(cnv) \
    ((cnv)->sharedData->mbcs.outputType!=MBCS_OUTPUT_2_SISO ? \
        ((cnv)->sharedData->mbcs.outputType!=MBCS_OUTPUT_DBCS_ONLY ? -1 : 1) : \
        (int8_t)(cnv)->mode)
274
275/*
276 * target<targetLimit; set error code for overflow
277 */
278U_CFUNC UBool
279ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
280                        int32_t firstLength,
281                        const char **src, const char *srcLimit,
282                        UChar **target, const UChar *targetLimit,
283                        int32_t **offsets, int32_t srcIndex,
284                        UBool flush,
285                        UErrorCode *pErrorCode) {
286    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
287    int32_t match;
288
289    /* try to match */
290    match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv),
291                           (const char *)cnv->toUBytes, firstLength,
292                           *src, (int32_t)(srcLimit-*src),
293                           &value,
294                           cnv->useFallback, flush);
295    if(match>0) {
296        /* advance src pointer for the consumed input */
297        *src+=match-firstLength;
298
299        /* write result to target */
300        ucnv_extWriteToU(cnv, cx,
301                         value,
302                         target, targetLimit,
303                         offsets, srcIndex,
304                         pErrorCode);
305        return TRUE;
306    } else if(match<0) {
307        /* save state for partial match */
308        const char *s;
309        int32_t j;
310
311        /* copy the first code point */
312        s=(const char *)cnv->toUBytes;
313        cnv->preToUFirstLength=(int8_t)firstLength;
314        for(j=0; j<firstLength; ++j) {
315            cnv->preToU[j]=*s++;
316        }
317
318        /* now copy the newly consumed input */
319        s=*src;
320        match=-match;
321        for(; j<match; ++j) {
322            cnv->preToU[j]=*s++;
323        }
324        *src=s; /* same as *src=srcLimit; because we reached the end of input */
325        cnv->preToULength=(int8_t)match;
326        return TRUE;
327    } else /* match==0 no match */ {
328        return FALSE;
329    }
330}
331
332U_CFUNC UChar32
333ucnv_extSimpleMatchToU(const int32_t *cx,
334                       const char *source, int32_t length,
335                       UBool useFallback) {
336    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
337    int32_t match;
338
339    if(length<=0) {
340        return 0xffff;
341    }
342
343    /* try to match */
344    match=ucnv_extMatchToU(cx, -1,
345                           source, length,
346                           NULL, 0,
347                           &value,
348                           useFallback, TRUE);
349    if(match==length) {
350        /* write result for simple, single-character conversion */
351        if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
352            return UCNV_EXT_TO_U_GET_CODE_POINT(value);
353        }
354    }
355
356    /*
357     * return no match because
358     * - match>0 && value points to string: simple conversion cannot handle multiple code points
359     * - match>0 && match!=length: not all input consumed, forbidden for this function
360     * - match==0: no match found in the first place
361     * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
362     */
363    return 0xfffe;
364}
365
366/*
367 * continue partial match with new input
368 * never called for simple, single-character conversion
369 */
370U_CFUNC void
371ucnv_extContinueMatchToU(UConverter *cnv,
372                         UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
373                         UErrorCode *pErrorCode) {
374    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
375    int32_t match, length;
376
377    match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv),
378                           cnv->preToU, cnv->preToULength,
379                           pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
380                           &value,
381                           cnv->useFallback, pArgs->flush);
382    if(match>0) {
383        if(match>=cnv->preToULength) {
384            /* advance src pointer for the consumed input */
385            pArgs->source+=match-cnv->preToULength;
386            cnv->preToULength=0;
387        } else {
388            /* the match did not use all of preToU[] - keep the rest for replay */
389            length=cnv->preToULength-match;
390            uprv_memmove(cnv->preToU, cnv->preToU+match, length);
391            cnv->preToULength=(int8_t)-length;
392        }
393
394        /* write result */
395        ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes,
396                         value,
397                         &pArgs->target, pArgs->targetLimit,
398                         &pArgs->offsets, srcIndex,
399                         pErrorCode);
400    } else if(match<0) {
401        /* save state for partial match */
402        const char *s;
403        int32_t j;
404
405        /* just _append_ the newly consumed input to preToU[] */
406        s=pArgs->source;
407        match=-match;
408        for(j=cnv->preToULength; j<match; ++j) {
409            cnv->preToU[j]=*s++;
410        }
411        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
412        cnv->preToULength=(int8_t)match;
413    } else /* match==0 */ {
414        /*
415         * no match
416         *
417         * We need to split the previous input into two parts:
418         *
419         * 1. The first codepage character is unmappable - that's how we got into
420         *    trying the extension data in the first place.
421         *    We need to move it from the preToU buffer
422         *    to the error buffer, set an error code,
423         *    and prepare the rest of the previous input for 2.
424         *
425         * 2. The rest of the previous input must be converted once we
426         *    come back from the callback for the first character.
427         *    At that time, we have to try again from scratch to convert
428         *    these input characters.
429         *    The replay will be handled by the ucnv.c conversion code.
430         */
431
432        /* move the first codepage character to the error field */
433        uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
434        cnv->toULength=cnv->preToUFirstLength;
435
436        /* move the rest up inside the buffer */
437        length=cnv->preToULength-cnv->preToUFirstLength;
438        if(length>0) {
439            uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
440        }
441
442        /* mark preToU for replay */
443        cnv->preToULength=(int8_t)-length;
444
445        /* set the error code for unassigned */
446        *pErrorCode=U_INVALID_CHAR_FOUND;
447    }
448}
449
450/* from Unicode ------------------------------------------------------------- */
451
452// Use roundtrips, "good one-way" mappings, and some normal fallbacks.
453static inline UBool
454extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) {
455    return
456        ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 ||
457            FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
458        (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0;
459}
460
461/*
462 * @return index of the UChar, if found; else <0
463 */
464static inline int32_t
465ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) {
466    int32_t i, start, limit;
467
468    /* binary search */
469    start=0;
470    limit=length;
471    for(;;) {
472        i=limit-start;
473        if(i<=1) {
474            break; /* done */
475        }
476        /* start<limit-1 */
477
478        if(i<=4) {
479            /* linear search for the last part */
480            if(u<=fromUSection[start]) {
481                break;
482            }
483            if(++start<limit && u<=fromUSection[start]) {
484                break;
485            }
486            if(++start<limit && u<=fromUSection[start]) {
487                break;
488            }
489            /* always break at start==limit-1 */
490            ++start;
491            break;
492        }
493
494        i=(start+limit)/2;
495        if(u<fromUSection[i]) {
496            limit=i;
497        } else {
498            start=i;
499        }
500    }
501
502    /* did we really find it? */
503    if(start<limit && u==fromUSection[start]) {
504        return start;
505    } else {
506        return -1; /* not found */
507    }
508}
509
510/*
511 * @param cx pointer to extension data; if NULL, returns 0
512 * @param firstCP the first code point before all the other UChars
513 * @param pre UChars that must match; !initialMatch: partial match with them
514 * @param preLength length of pre, >=0
515 * @param src UChars that can be used to complete a match
516 * @param srcLength length of src, >=0
517 * @param pMatchValue [out] output result value for the match from the data structure
518 * @param useFallback "use fallback" flag, usually from cnv->useFallback
519 * @param flush TRUE if the end of the input stream is reached
520 * @return >1: matched, return value=total match length (number of input units matched)
521 *          1: matched, no mapping but request for <subchar1>
522 *             (only for the first code point)
523 *          0: no match
524 *         <0: partial match, return value=negative total match length
525 *             (partial matches are never returned for flush==TRUE)
526 *             (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
527 *         the matchLength is 2 if only firstCP matched, and >2 if firstCP and
528 *         further code units matched
529 */
530static int32_t
531ucnv_extMatchFromU(const int32_t *cx,
532                   UChar32 firstCP,
533                   const UChar *pre, int32_t preLength,
534                   const UChar *src, int32_t srcLength,
535                   uint32_t *pMatchValue,
536                   UBool useFallback, UBool flush) {
537    const uint16_t *stage12, *stage3;
538    const uint32_t *stage3b;
539
540    const UChar *fromUTableUChars, *fromUSectionUChars;
541    const uint32_t *fromUTableValues, *fromUSectionValues;
542
543    uint32_t value, matchValue;
544    int32_t i, j, idx, length, matchLength;
545    UChar c;
546
547    if(cx==NULL) {
548        return 0; /* no extension data, no match */
549    }
550
551    /* trie lookup of firstCP */
552    idx=firstCP>>10; /* stage 1 index */
553    if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
554        return 0; /* the first code point is outside the trie */
555    }
556
557    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
558    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
559    idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP);
560
561    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
562    value=stage3b[idx];
563    if(value==0) {
564        return 0;
565    }
566
567    /*
568     * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0:
569     * Do not interpret values with reserved bits used, for forward compatibility,
570     * and do not even remember intermediate results with reserved bits used.
571     */
572
573    if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
574        /* partial match, enter the loop below */
575        idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
576
577        /* initialize */
578        fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar);
579        fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);
580
581        matchValue=0;
582        i=j=matchLength=0;
583
584        /* we must not remember fallback matches when not using fallbacks */
585
586        /* match input units until there is a full match or the input is consumed */
587        for(;;) {
588            /* go to the next section */
589            fromUSectionUChars=fromUTableUChars+idx;
590            fromUSectionValues=fromUTableValues+idx;
591
592            /* read first pair of the section */
593            length=*fromUSectionUChars++;
594            value=*fromUSectionValues++;
595            if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) {
596                /* remember longest match so far */
597                matchValue=value;
598                matchLength=2+i+j;
599            }
600
601            /* match pre[] then src[] */
602            if(i<preLength) {
603                c=pre[i++];
604            } else if(j<srcLength) {
605                c=src[j++];
606            } else {
607                /* all input consumed, partial match */
608                if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) {
609                    /*
610                     * end of the entire input stream, stop with the longest match so far
611                     * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
612                     * because it must fit into state buffers
613                     */
614                    break;
615                } else {
616                    /* continue with more input next time */
617                    return -(2+length);
618                }
619            }
620
621            /* search for the current UChar */
622            idx=ucnv_extFindFromU(fromUSectionUChars, length, c);
623            if(idx<0) {
624                /* no match here, stop with the longest match so far */
625                break;
626            } else {
627                value=fromUSectionValues[idx];
628                if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
629                    /* partial match, continue */
630                    idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
631                } else {
632                    if(extFromUUseMapping(useFallback, value, firstCP)) {
633                        /* full match, stop with result */
634                        matchValue=value;
635                        matchLength=2+i+j;
636                    } else {
637                        /* full match on fallback not taken, stop with the longest match so far */
638                    }
639                    break;
640                }
641            }
642        }
643
644        if(matchLength==0) {
645            /* no match at all */
646            return 0;
647        }
648    } else /* result from firstCP trie lookup */ {
649        if(extFromUUseMapping(useFallback, value, firstCP)) {
650            /* full match, stop with result */
651            matchValue=value;
652            matchLength=2;
653        } else {
654            /* fallback not taken */
655            return 0;
656        }
657    }
658
659    /* return result */
660    if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
661        return 1; /* assert matchLength==2 */
662    }
663
664    *pMatchValue=matchValue;
665    return matchLength;
666}
667
668/*
669 * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits
670 */
671static inline void
672ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx,
673                   uint32_t value,
674                   char **target, const char *targetLimit,
675                   int32_t **offsets, int32_t srcIndex,
676                   UErrorCode *pErrorCode) {
677    uint8_t buffer[1+UCNV_EXT_MAX_BYTES];
678    const uint8_t *result;
679    int32_t length, prevLength;
680
681    length=UCNV_EXT_FROM_U_GET_LENGTH(value);
682    value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
683
684    /* output the result */
685    if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
686        /*
687         * Generate a byte array and then write it below.
688         * This is not the fastest possible way, but it should be ok for
689         * extension mappings, and it is much simpler.
690         * Offset and overflow handling are only done once this way.
691         */
692        uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */
693        switch(length) {
694        case 3:
695            *p++=(uint8_t)(value>>16);
696            U_FALLTHROUGH;
697        case 2:
698            *p++=(uint8_t)(value>>8);
699            U_FALLTHROUGH;
700        case 1:
701            *p++=(uint8_t)value;
702            U_FALLTHROUGH;
703        default:
704            break; /* will never occur */
705        }
706        result=buffer+1;
707    } else {
708        result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
709    }
710
711    /* with correct data we have length>0 */
712
713    if((prevLength=cnv->fromUnicodeStatus)!=0) {
714        /* handle SI/SO stateful output */
715        uint8_t shiftByte;
716
717        if(prevLength>1 && length==1) {
718            /* change from double-byte mode to single-byte */
719            shiftByte=(uint8_t)UCNV_SI;
720            cnv->fromUnicodeStatus=1;
721        } else if(prevLength==1 && length>1) {
722            /* change from single-byte mode to double-byte */
723            shiftByte=(uint8_t)UCNV_SO;
724            cnv->fromUnicodeStatus=2;
725        } else {
726            shiftByte=0;
727        }
728
729        if(shiftByte!=0) {
730            /* prepend the shift byte to the result bytes */
731            buffer[0]=shiftByte;
732            if(result!=buffer+1) {
733                uprv_memcpy(buffer+1, result, length);
734            }
735            result=buffer;
736            ++length;
737        }
738    }
739
740    ucnv_fromUWriteBytes(cnv, (const char *)result, length,
741                         target, targetLimit,
742                         offsets, srcIndex,
743                         pErrorCode);
744}
745
746/*
747 * target<targetLimit; set error code for overflow
748 */
749U_CFUNC UBool
750ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
751                          UChar32 cp,
752                          const UChar **src, const UChar *srcLimit,
753                          char **target, const char *targetLimit,
754                          int32_t **offsets, int32_t srcIndex,
755                          UBool flush,
756                          UErrorCode *pErrorCode) {
757    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
758    int32_t match;
759
760    /* try to match */
761    match=ucnv_extMatchFromU(cx, cp,
762                             NULL, 0,
763                             *src, (int32_t)(srcLimit-*src),
764                             &value,
765                             cnv->useFallback, flush);
766
767    /* reject a match if the result is a single byte for DBCS-only */
768    if( match>=2 &&
769        !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 &&
770          cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY)
771    ) {
772        /* advance src pointer for the consumed input */
773        *src+=match-2; /* remove 2 for the initial code point */
774
775        /* write result to target */
776        ucnv_extWriteFromU(cnv, cx,
777                           value,
778                           target, targetLimit,
779                           offsets, srcIndex,
780                           pErrorCode);
781        return TRUE;
782    } else if(match<0) {
783        /* save state for partial match */
784        const UChar *s;
785        int32_t j;
786
787        /* copy the first code point */
788        cnv->preFromUFirstCP=cp;
789
790        /* now copy the newly consumed input */
791        s=*src;
792        match=-match-2; /* remove 2 for the initial code point */
793        for(j=0; j<match; ++j) {
794            cnv->preFromU[j]=*s++;
795        }
796        *src=s; /* same as *src=srcLimit; because we reached the end of input */
797        cnv->preFromULength=(int8_t)match;
798        return TRUE;
799    } else if(match==1) {
800        /* matched, no mapping but request for <subchar1> */
801        cnv->useSubChar1=TRUE;
802        return FALSE;
803    } else /* match==0 no match */ {
804        return FALSE;
805    }
806}
807
808/*
809 * Used by ISO 2022 implementation.
810 * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping
811 */
812U_CFUNC int32_t
813ucnv_extSimpleMatchFromU(const int32_t *cx,
814                         UChar32 cp, uint32_t *pValue,
815                         UBool useFallback) {
816    uint32_t value;
817    int32_t match;
818
819    /* try to match */
820    match=ucnv_extMatchFromU(cx,
821                             cp,
822                             NULL, 0,
823                             NULL, 0,
824                             &value,
825                             useFallback, TRUE);
826    if(match>=2) {
827        /* write result for simple, single-character conversion */
828        int32_t length;
829        int isRoundtrip;
830
831        isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value);
832        length=UCNV_EXT_FROM_U_GET_LENGTH(value);
833        value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
834
835        if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
836            *pValue=value;
837            return isRoundtrip ? length : -length;
838#if 0 /* not currently used */
839        } else if(length==4) {
840            /* de-serialize a 4-byte result */
841            const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
842            *pValue=
843                ((uint32_t)result[0]<<24)|
844                ((uint32_t)result[1]<<16)|
845                ((uint32_t)result[2]<<8)|
846                result[3];
847            return isRoundtrip ? 4 : -4;
848#endif
849        }
850    }
851
852    /*
853     * return no match because
854     * - match>1 && resultLength>4: result too long for simple conversion
855     * - match==1: no match found, <subchar1> preferred
856     * - match==0: no match found in the first place
857     * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
858     */
859    return 0;
860}
861
862/*
863 * continue partial match with new input, requires cnv->preFromUFirstCP>=0
864 * never called for simple, single-character conversion
865 */
866U_CFUNC void
867ucnv_extContinueMatchFromU(UConverter *cnv,
868                           UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
869                           UErrorCode *pErrorCode) {
870    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
871    int32_t match;
872
873    match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes,
874                             cnv->preFromUFirstCP,
875                             cnv->preFromU, cnv->preFromULength,
876                             pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
877                             &value,
878                             cnv->useFallback, pArgs->flush);
879    if(match>=2) {
880        match-=2; /* remove 2 for the initial code point */
881
882        if(match>=cnv->preFromULength) {
883            /* advance src pointer for the consumed input */
884            pArgs->source+=match-cnv->preFromULength;
885            cnv->preFromULength=0;
886        } else {
887            /* the match did not use all of preFromU[] - keep the rest for replay */
888            int32_t length=cnv->preFromULength-match;
889            u_memmove(cnv->preFromU, cnv->preFromU+match, length);
890            cnv->preFromULength=(int8_t)-length;
891        }
892
893        /* finish the partial match */
894        cnv->preFromUFirstCP=U_SENTINEL;
895
896        /* write result */
897        ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes,
898                           value,
899                           &pArgs->target, pArgs->targetLimit,
900                           &pArgs->offsets, srcIndex,
901                           pErrorCode);
902    } else if(match<0) {
903        /* save state for partial match */
904        const UChar *s;
905        int32_t j;
906
907        /* just _append_ the newly consumed input to preFromU[] */
908        s=pArgs->source;
909        match=-match-2; /* remove 2 for the initial code point */
910        for(j=cnv->preFromULength; j<match; ++j) {
911            U_ASSERT(j>=0);
912            cnv->preFromU[j]=*s++;
913        }
914        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
915        cnv->preFromULength=(int8_t)match;
916    } else /* match==0 or 1 */ {
917        /*
918         * no match
919         *
920         * We need to split the previous input into two parts:
921         *
922         * 1. The first code point is unmappable - that's how we got into
923         *    trying the extension data in the first place.
924         *    We need to move it from the preFromU buffer
925         *    to the error buffer, set an error code,
926         *    and prepare the rest of the previous input for 2.
927         *
928         * 2. The rest of the previous input must be converted once we
929         *    come back from the callback for the first code point.
930         *    At that time, we have to try again from scratch to convert
931         *    these input characters.
932         *    The replay will be handled by the ucnv.c conversion code.
933         */
934
935        if(match==1) {
936            /* matched, no mapping but request for <subchar1> */
937            cnv->useSubChar1=TRUE;
938        }
939
940        /* move the first code point to the error field */
941        cnv->fromUChar32=cnv->preFromUFirstCP;
942        cnv->preFromUFirstCP=U_SENTINEL;
943
944        /* mark preFromU for replay */
945        cnv->preFromULength=-cnv->preFromULength;
946
947        /* set the error code for unassigned */
948        *pErrorCode=U_INVALID_CHAR_FOUND;
949    }
950}
951
952static UBool
953extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) {
954    if(which==UCNV_ROUNDTRIP_SET) {
955        // Add only code points for which the roundtrip flag is set.
956        // Do not add any fallbacks, even if ucnv_fromUnicode() would use them
957        // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet().
958        //
959        // By analogy, also do not add "good one-way" mappings.
960        //
961        // Do not add entries with reserved bits set.
962        if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!=
963                UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) {
964            return FALSE;
965        }
966    } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
967        // Do not add entries with reserved bits set.
968        if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) {
969            return FALSE;
970        }
971    }
972    // Do not add <subchar1> entries or other (future?) pseudo-entries
973    // with an output length of 0.
974    return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength;
975}
976
977static void
978ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
979                            const int32_t *cx,
980                            const USetAdder *sa,
981                            UConverterUnicodeSet which,
982                            int32_t minLength,
983                            UChar32 firstCP,
984                            UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
985                            int32_t sectionIndex,
986                            UErrorCode *pErrorCode) {
987    const UChar *fromUSectionUChars;
988    const uint32_t *fromUSectionValues;
989
990    uint32_t value;
991    int32_t i, count;
992
993    fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
994    fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
995
996    /* read first pair of the section */
997    count=*fromUSectionUChars++;
998    value=*fromUSectionValues++;
999
1000    if(extSetUseMapping(which, minLength, value)) {
1001        if(length==U16_LENGTH(firstCP)) {
1002            /* add the initial code point */
1003            sa->add(sa->set, firstCP);
1004        } else {
1005            /* add the string so far */
1006            sa->addString(sa->set, s, length);
1007        }
1008    }
1009
1010    for(i=0; i<count; ++i) {
1011        /* append this code unit and recurse or add the string */
1012        s[length]=fromUSectionUChars[i];
1013        value=fromUSectionValues[i];
1014
1015        if(value==0) {
1016            /* no mapping, do nothing */
1017        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
1018            ucnv_extGetUnicodeSetString(
1019                sharedData, cx, sa, which, minLength,
1020                firstCP, s, length+1,
1021                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
1022                pErrorCode);
1023        } else if(extSetUseMapping(which, minLength, value)) {
1024            sa->addString(sa->set, s, length+1);
1025        }
1026    }
1027}
1028
1029U_CFUNC void
1030ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
1031                      const USetAdder *sa,
1032                      UConverterUnicodeSet which,
1033                      UConverterSetFilter filter,
1034                      UErrorCode *pErrorCode) {
1035    const int32_t *cx;
1036    const uint16_t *stage12, *stage3, *ps2, *ps3;
1037    const uint32_t *stage3b;
1038
1039    uint32_t value;
1040    int32_t st1, stage1Length, st2, st3, minLength;
1041
1042    UChar s[UCNV_EXT_MAX_UCHARS];
1043    UChar32 c;
1044    int32_t length;
1045
1046    cx=sharedData->mbcs.extIndexes;
1047    if(cx==NULL) {
1048        return;
1049    }
1050
1051    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
1052    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
1053    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
1054
1055    stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
1056
1057    /* enumerate the from-Unicode trie table */
1058    c=0; /* keep track of the current code point while enumerating */
1059
1060    if(filter==UCNV_SET_FILTER_2022_CN) {
1061        minLength=3;
1062    } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
1063               filter!=UCNV_SET_FILTER_NONE
1064    ) {
1065        /* DBCS-only, ignore single-byte results */
1066        minLength=2;
1067    } else {
1068        minLength=1;
1069    }
1070
1071    /*
1072     * the trie enumeration is almost the same as
1073     * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1
1074     */
1075    for(st1=0; st1<stage1Length; ++st1) {
1076        st2=stage12[st1];
1077        if(st2>stage1Length) {
1078            ps2=stage12+st2;
1079            for(st2=0; st2<64; ++st2) {
1080                if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
1081                    /* read the stage 3 block */
1082                    ps3=stage3+st3;
1083
1084                    do {
1085                        value=stage3b[*ps3++];
1086                        if(value==0) {
1087                            /* no mapping, do nothing */
1088                        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
1089                            // Recurse for partial results.
1090                            length=0;
1091                            U16_APPEND_UNSAFE(s, length, c);
1092                            ucnv_extGetUnicodeSetString(
1093                                sharedData, cx, sa, which, minLength,
1094                                c, s, length,
1095                                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
1096                                pErrorCode);
1097                        } else if(extSetUseMapping(which, minLength, value)) {
1098                            switch(filter) {
1099                            case UCNV_SET_FILTER_2022_CN:
1100                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
1101                                    continue;
1102                                }
1103                                break;
1104                            case UCNV_SET_FILTER_SJIS:
1105                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
1106                                    continue;
1107                                }
1108                                break;
1109                            case UCNV_SET_FILTER_GR94DBCS:
1110                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
1111                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
1112                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
1113                                    continue;
1114                                }
1115                                break;
1116                            case UCNV_SET_FILTER_HZ:
1117                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
1118                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
1119                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
1120                                    continue;
1121                                }
1122                                break;
1123                            default:
1124                                /*
1125                                 * UCNV_SET_FILTER_NONE,
1126                                 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
1127                                 */
1128                                break;
1129                            }
1130                            sa->add(sa->set, c);
1131                        }
1132                    } while((++c&0xf)!=0);
1133                } else {
1134                    c+=16; /* empty stage 3 block */
1135                }
1136            }
1137        } else {
1138            c+=1024; /* empty stage 2 block */
1139        }
1140    }
1141}
1142
1143#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
1144