1/*
2******************************************************************************
3*
4*   Copyright (C) 2003-2009, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  ucnv_ext.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003jun13
14*   created by: Markus W. Scherer
15*
16*   Conversion extensions
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
22
23#include "unicode/uset.h"
24#include "ucnv_bld.h"
25#include "ucnv_cnv.h"
26#include "ucnv_ext.h"
27#include "cmemory.h"
28
29/* to Unicode --------------------------------------------------------------- */
30
31/*
32 * @return lookup value for the byte, if found; else 0
33 */
34static U_INLINE uint32_t
35ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
36    uint32_t word0, word;
37    int32_t i, start, limit;
38
39    /* check the input byte against the lowest and highest section bytes */
40    start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]);
41    limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]);
42    if(byte<start || limit<byte) {
43        return 0; /* the byte is out of range */
44    }
45
46    if(length==((limit-start)+1)) {
47        /* direct access on a linear array */
48        return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */
49    }
50
51    /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
52    word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0);
53
54    /*
55     * Shift byte once instead of each section word and add 0xffffff.
56     * We will compare the shifted/added byte (bbffffff) against
57     * section words which have byte values in the same bit position.
58     * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv
59     * for all v=0..f
60     * so we need not mask off the lower 24 bits of each section word.
61     */
62    word=word0|UCNV_EXT_TO_U_VALUE_MASK;
63
64    /* binary search */
65    start=0;
66    limit=length;
67    for(;;) {
68        i=limit-start;
69        if(i<=1) {
70            break; /* done */
71        }
72        /* start<limit-1 */
73
74        if(i<=4) {
75            /* linear search for the last part */
76            if(word0<=toUSection[start]) {
77                break;
78            }
79            if(++start<limit && word0<=toUSection[start]) {
80                break;
81            }
82            if(++start<limit && word0<=toUSection[start]) {
83                break;
84            }
85            /* always break at start==limit-1 */
86            ++start;
87            break;
88        }
89
90        i=(start+limit)/2;
91        if(word<toUSection[i]) {
92            limit=i;
93        } else {
94            start=i;
95        }
96    }
97
98    /* did we really find it? */
99    if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) {
100        return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */
101    } else {
102        return 0; /* not found */
103    }
104}
105
/*
 * TRUE if not an SI/SO stateful converter (sisoState<0),
 * or if the match length fits with the current converter state:
 * a match length of exactly 1 when sisoState==0,
 * any other match length when sisoState>0.
 *
 * Both arguments are parenthesized so that arbitrary expressions
 * (not just additive ones like i+j) are compared as a whole.
 * sisoState may be evaluated more than once.
 */
#define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
    ((sisoState)<0 || ((sisoState)==0) == ((match)==1))
112
113/*
114 * this works like ucnv_extMatchFromU() except
115 * - the first character is in pre
116 * - no trie is used
117 * - the returned matchLength is not offset by 2
118 */
119static int32_t
120ucnv_extMatchToU(const int32_t *cx, int8_t sisoState,
121                 const char *pre, int32_t preLength,
122                 const char *src, int32_t srcLength,
123                 uint32_t *pMatchValue,
124                 UBool useFallback, UBool flush) {
125    const uint32_t *toUTable, *toUSection;
126
127    uint32_t value, matchValue;
128    int32_t i, j, idx, length, matchLength;
129    uint8_t b;
130
131    if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) {
132        return 0; /* no extension data, no match */
133    }
134
135    /* initialize */
136    toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
137    idx=0;
138
139    matchValue=0;
140    i=j=matchLength=0;
141
142    if(sisoState==0) {
143        /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
144        if(preLength>1) {
145            return 0; /* no match of a DBCS sequence in SBCS mode */
146        } else if(preLength==1) {
147            srcLength=0;
148        } else /* preLength==0 */ {
149            if(srcLength>1) {
150                srcLength=1;
151            }
152        }
153        flush=TRUE;
154    }
155
156    /* we must not remember fallback matches when not using fallbacks */
157
158    /* match input units until there is a full match or the input is consumed */
159    for(;;) {
160        /* go to the next section */
161        toUSection=toUTable+idx;
162
163        /* read first pair of the section */
164        value=*toUSection++;
165        length=UCNV_EXT_TO_U_GET_BYTE(value);
166        value=UCNV_EXT_TO_U_GET_VALUE(value);
167        if( value!=0 &&
168            (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
169             TO_U_USE_FALLBACK(useFallback)) &&
170            UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
171        ) {
172            /* remember longest match so far */
173            matchValue=value;
174            matchLength=i+j;
175        }
176
177        /* match pre[] then src[] */
178        if(i<preLength) {
179            b=(uint8_t)pre[i++];
180        } else if(j<srcLength) {
181            b=(uint8_t)src[j++];
182        } else {
183            /* all input consumed, partial match */
184            if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) {
185                /*
186                 * end of the entire input stream, stop with the longest match so far
187                 * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
188                 * because it must fit into state buffers
189                 */
190                break;
191            } else {
192                /* continue with more input next time */
193                return -length;
194            }
195        }
196
197        /* search for the current UChar */
198        value=ucnv_extFindToU(toUSection, length, b);
199        if(value==0) {
200            /* no match here, stop with the longest match so far */
201            break;
202        } else {
203            if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
204                /* partial match, continue */
205                idx=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
206            } else {
207                if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
208                     TO_U_USE_FALLBACK(useFallback)) &&
209                    UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
210                ) {
211                    /* full match, stop with result */
212                    matchValue=value;
213                    matchLength=i+j;
214                } else {
215                    /* full match on fallback not taken, stop with the longest match so far */
216                }
217                break;
218            }
219        }
220    }
221
222    if(matchLength==0) {
223        /* no match at all */
224        return 0;
225    }
226
227    /* return result */
228    *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
229    return matchLength;
230}
231
232static U_INLINE void
233ucnv_extWriteToU(UConverter *cnv, const int32_t *cx,
234                 uint32_t value,
235                 UChar **target, const UChar *targetLimit,
236                 int32_t **offsets, int32_t srcIndex,
237                 UErrorCode *pErrorCode) {
238    /* output the result */
239    if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
240        /* output a single code point */
241        ucnv_toUWriteCodePoint(
242            cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value),
243            target, targetLimit,
244            offsets, srcIndex,
245            pErrorCode);
246    } else {
247        /* output a string - with correct data we have resultLength>0 */
248        ucnv_toUWriteUChars(
249            cnv,
250            UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+
251                UCNV_EXT_TO_U_GET_INDEX(value),
252            UCNV_EXT_TO_U_GET_LENGTH(value),
253            target, targetLimit,
254            offsets, srcIndex,
255            pErrorCode);
256    }
257}
258
/*
 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS),
 * or 1 for DBCS-only,
 * or -1 if the converter is not SI/SO stateful
 *
 * The result is selected by cnv->sharedData->mbcs.outputType:
 * - MBCS_OUTPUT_2_SISO: cnv->mode, cast to int8_t
 * - MBCS_OUTPUT_DBCS_ONLY: 1
 * - anything else: -1
 *
 * Note: For SI/SO stateful converters getting here,
 * cnv->mode==0 is equivalent to firstLength==1.
 *
 * The cnv argument is evaluated more than once.
 */
#define UCNV_SISO_STATE(cnv) \
    ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \
     (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1)
270
271/*
272 * target<targetLimit; set error code for overflow
273 */
274U_CFUNC UBool
275ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
276                        int32_t firstLength,
277                        const char **src, const char *srcLimit,
278                        UChar **target, const UChar *targetLimit,
279                        int32_t **offsets, int32_t srcIndex,
280                        UBool flush,
281                        UErrorCode *pErrorCode) {
282    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
283    int32_t match;
284
285    /* try to match */
286    match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv),
287                           (const char *)cnv->toUBytes, firstLength,
288                           *src, (int32_t)(srcLimit-*src),
289                           &value,
290                           cnv->useFallback, flush);
291    if(match>0) {
292        /* advance src pointer for the consumed input */
293        *src+=match-firstLength;
294
295        /* write result to target */
296        ucnv_extWriteToU(cnv, cx,
297                         value,
298                         target, targetLimit,
299                         offsets, srcIndex,
300                         pErrorCode);
301        return TRUE;
302    } else if(match<0) {
303        /* save state for partial match */
304        const char *s;
305        int32_t j;
306
307        /* copy the first code point */
308        s=(const char *)cnv->toUBytes;
309        cnv->preToUFirstLength=(int8_t)firstLength;
310        for(j=0; j<firstLength; ++j) {
311            cnv->preToU[j]=*s++;
312        }
313
314        /* now copy the newly consumed input */
315        s=*src;
316        match=-match;
317        for(; j<match; ++j) {
318            cnv->preToU[j]=*s++;
319        }
320        *src=s; /* same as *src=srcLimit; because we reached the end of input */
321        cnv->preToULength=(int8_t)match;
322        return TRUE;
323    } else /* match==0 no match */ {
324        return FALSE;
325    }
326}
327
328U_CFUNC UChar32
329ucnv_extSimpleMatchToU(const int32_t *cx,
330                       const char *source, int32_t length,
331                       UBool useFallback) {
332    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
333    int32_t match;
334
335    if(length<=0) {
336        return 0xffff;
337    }
338
339    /* try to match */
340    match=ucnv_extMatchToU(cx, -1,
341                           source, length,
342                           NULL, 0,
343                           &value,
344                           useFallback, TRUE);
345    if(match==length) {
346        /* write result for simple, single-character conversion */
347        if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
348            return UCNV_EXT_TO_U_GET_CODE_POINT(value);
349        }
350    }
351
352    /*
353     * return no match because
354     * - match>0 && value points to string: simple conversion cannot handle multiple code points
355     * - match>0 && match!=length: not all input consumed, forbidden for this function
356     * - match==0: no match found in the first place
357     * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
358     */
359    return 0xfffe;
360}
361
362/*
363 * continue partial match with new input
364 * never called for simple, single-character conversion
365 */
366U_CFUNC void
367ucnv_extContinueMatchToU(UConverter *cnv,
368                         UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
369                         UErrorCode *pErrorCode) {
370    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
371    int32_t match, length;
372
373    match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv),
374                           cnv->preToU, cnv->preToULength,
375                           pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
376                           &value,
377                           cnv->useFallback, pArgs->flush);
378    if(match>0) {
379        if(match>=cnv->preToULength) {
380            /* advance src pointer for the consumed input */
381            pArgs->source+=match-cnv->preToULength;
382            cnv->preToULength=0;
383        } else {
384            /* the match did not use all of preToU[] - keep the rest for replay */
385            length=cnv->preToULength-match;
386            uprv_memmove(cnv->preToU, cnv->preToU+match, length);
387            cnv->preToULength=(int8_t)-length;
388        }
389
390        /* write result */
391        ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes,
392                         value,
393                         &pArgs->target, pArgs->targetLimit,
394                         &pArgs->offsets, srcIndex,
395                         pErrorCode);
396    } else if(match<0) {
397        /* save state for partial match */
398        const char *s;
399        int32_t j;
400
401        /* just _append_ the newly consumed input to preToU[] */
402        s=pArgs->source;
403        match=-match;
404        for(j=cnv->preToULength; j<match; ++j) {
405            cnv->preToU[j]=*s++;
406        }
407        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
408        cnv->preToULength=(int8_t)match;
409    } else /* match==0 */ {
410        /*
411         * no match
412         *
413         * We need to split the previous input into two parts:
414         *
415         * 1. The first codepage character is unmappable - that's how we got into
416         *    trying the extension data in the first place.
417         *    We need to move it from the preToU buffer
418         *    to the error buffer, set an error code,
419         *    and prepare the rest of the previous input for 2.
420         *
421         * 2. The rest of the previous input must be converted once we
422         *    come back from the callback for the first character.
423         *    At that time, we have to try again from scratch to convert
424         *    these input characters.
425         *    The replay will be handled by the ucnv.c conversion code.
426         */
427
428        /* move the first codepage character to the error field */
429        uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
430        cnv->toULength=cnv->preToUFirstLength;
431
432        /* move the rest up inside the buffer */
433        length=cnv->preToULength-cnv->preToUFirstLength;
434        if(length>0) {
435            uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
436        }
437
438        /* mark preToU for replay */
439        cnv->preToULength=(int8_t)-length;
440
441        /* set the error code for unassigned */
442        *pErrorCode=U_INVALID_CHAR_FOUND;
443    }
444}
445
446/* from Unicode ------------------------------------------------------------- */
447
448/*
449 * @return index of the UChar, if found; else <0
450 */
451static U_INLINE int32_t
452ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) {
453    int32_t i, start, limit;
454
455    /* binary search */
456    start=0;
457    limit=length;
458    for(;;) {
459        i=limit-start;
460        if(i<=1) {
461            break; /* done */
462        }
463        /* start<limit-1 */
464
465        if(i<=4) {
466            /* linear search for the last part */
467            if(u<=fromUSection[start]) {
468                break;
469            }
470            if(++start<limit && u<=fromUSection[start]) {
471                break;
472            }
473            if(++start<limit && u<=fromUSection[start]) {
474                break;
475            }
476            /* always break at start==limit-1 */
477            ++start;
478            break;
479        }
480
481        i=(start+limit)/2;
482        if(u<fromUSection[i]) {
483            limit=i;
484        } else {
485            start=i;
486        }
487    }
488
489    /* did we really find it? */
490    if(start<limit && u==fromUSection[start]) {
491        return start;
492    } else {
493        return -1; /* not found */
494    }
495}
496
497/*
498 * @param cx pointer to extension data; if NULL, returns 0
499 * @param firstCP the first code point before all the other UChars
500 * @param pre UChars that must match; !initialMatch: partial match with them
501 * @param preLength length of pre, >=0
502 * @param src UChars that can be used to complete a match
503 * @param srcLength length of src, >=0
504 * @param pMatchValue [out] output result value for the match from the data structure
505 * @param useFallback "use fallback" flag, usually from cnv->useFallback
506 * @param flush TRUE if the end of the input stream is reached
507 * @return >1: matched, return value=total match length (number of input units matched)
508 *          1: matched, no mapping but request for <subchar1>
509 *             (only for the first code point)
510 *          0: no match
511 *         <0: partial match, return value=negative total match length
512 *             (partial matches are never returned for flush==TRUE)
513 *             (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
514 *         the matchLength is 2 if only firstCP matched, and >2 if firstCP and
515 *         further code units matched
516 */
517static int32_t
518ucnv_extMatchFromU(const int32_t *cx,
519                   UChar32 firstCP,
520                   const UChar *pre, int32_t preLength,
521                   const UChar *src, int32_t srcLength,
522                   uint32_t *pMatchValue,
523                   UBool useFallback, UBool flush) {
524    const uint16_t *stage12, *stage3;
525    const uint32_t *stage3b;
526
527    const UChar *fromUTableUChars, *fromUSectionUChars;
528    const uint32_t *fromUTableValues, *fromUSectionValues;
529
530    uint32_t value, matchValue;
531    int32_t i, j, idx, length, matchLength;
532    UChar c;
533
534    if(cx==NULL) {
535        return 0; /* no extension data, no match */
536    }
537
538    /* trie lookup of firstCP */
539    idx=firstCP>>10; /* stage 1 index */
540    if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
541        return 0; /* the first code point is outside the trie */
542    }
543
544    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
545    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
546    idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP);
547
548    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
549    value=stage3b[idx];
550    if(value==0) {
551        return 0;
552    }
553
554    /*
555     * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0:
556     * Do not interpret values with reserved bits used, for forward compatibility,
557     * and do not even remember intermediate results with reserved bits used.
558     */
559
560    if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
561        /* partial match, enter the loop below */
562        idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
563
564        /* initialize */
565        fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar);
566        fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);
567
568        matchValue=0;
569        i=j=matchLength=0;
570
571        /* we must not remember fallback matches when not using fallbacks */
572
573        /* match input units until there is a full match or the input is consumed */
574        for(;;) {
575            /* go to the next section */
576            fromUSectionUChars=fromUTableUChars+idx;
577            fromUSectionValues=fromUTableValues+idx;
578
579            /* read first pair of the section */
580            length=*fromUSectionUChars++;
581            value=*fromUSectionValues++;
582            if( value!=0 &&
583                (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
584                 FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
585                (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
586            ) {
587                /* remember longest match so far */
588                matchValue=value;
589                matchLength=2+i+j;
590            }
591
592            /* match pre[] then src[] */
593            if(i<preLength) {
594                c=pre[i++];
595            } else if(j<srcLength) {
596                c=src[j++];
597            } else {
598                /* all input consumed, partial match */
599                if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) {
600                    /*
601                     * end of the entire input stream, stop with the longest match so far
602                     * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
603                     * because it must fit into state buffers
604                     */
605                    break;
606                } else {
607                    /* continue with more input next time */
608                    return -(2+length);
609                }
610            }
611
612            /* search for the current UChar */
613            idx=ucnv_extFindFromU(fromUSectionUChars, length, c);
614            if(idx<0) {
615                /* no match here, stop with the longest match so far */
616                break;
617            } else {
618                value=fromUSectionValues[idx];
619                if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
620                    /* partial match, continue */
621                    idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
622                } else {
623                    if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
624                         FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
625                        (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
626                    ) {
627                        /* full match, stop with result */
628                        matchValue=value;
629                        matchLength=2+i+j;
630                    } else {
631                        /* full match on fallback not taken, stop with the longest match so far */
632                    }
633                    break;
634                }
635            }
636        }
637
638        if(matchLength==0) {
639            /* no match at all */
640            return 0;
641        }
642    } else /* result from firstCP trie lookup */ {
643        if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
644             FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
645            (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
646        ) {
647            /* full match, stop with result */
648            matchValue=value;
649            matchLength=2;
650        } else {
651            /* fallback not taken */
652            return 0;
653        }
654    }
655
656    /* return result */
657    if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
658        return 1; /* assert matchLength==2 */
659    }
660
661    *pMatchValue=matchValue;
662    return matchLength;
663}
664
665/*
666 * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits
667 */
668static U_INLINE void
669ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx,
670                   uint32_t value,
671                   char **target, const char *targetLimit,
672                   int32_t **offsets, int32_t srcIndex,
673                   UErrorCode *pErrorCode) {
674    uint8_t buffer[1+UCNV_EXT_MAX_BYTES];
675    const uint8_t *result;
676    int32_t length, prevLength;
677
678    length=UCNV_EXT_FROM_U_GET_LENGTH(value);
679    value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
680
681    /* output the result */
682    if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
683        /*
684         * Generate a byte array and then write it below.
685         * This is not the fastest possible way, but it should be ok for
686         * extension mappings, and it is much simpler.
687         * Offset and overflow handling are only done once this way.
688         */
689        uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */
690        switch(length) {
691        case 3:
692            *p++=(uint8_t)(value>>16);
693        case 2:
694            *p++=(uint8_t)(value>>8);
695        case 1:
696            *p++=(uint8_t)value;
697        default:
698            break; /* will never occur */
699        }
700        result=buffer+1;
701    } else {
702        result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
703    }
704
705    /* with correct data we have length>0 */
706
707    if((prevLength=cnv->fromUnicodeStatus)!=0) {
708        /* handle SI/SO stateful output */
709        uint8_t shiftByte;
710
711        if(prevLength>1 && length==1) {
712            /* change from double-byte mode to single-byte */
713            shiftByte=(uint8_t)UCNV_SI;
714            cnv->fromUnicodeStatus=1;
715        } else if(prevLength==1 && length>1) {
716            /* change from single-byte mode to double-byte */
717            shiftByte=(uint8_t)UCNV_SO;
718            cnv->fromUnicodeStatus=2;
719        } else {
720            shiftByte=0;
721        }
722
723        if(shiftByte!=0) {
724            /* prepend the shift byte to the result bytes */
725            buffer[0]=shiftByte;
726            if(result!=buffer+1) {
727                uprv_memcpy(buffer+1, result, length);
728            }
729            result=buffer;
730            ++length;
731        }
732    }
733
734    ucnv_fromUWriteBytes(cnv, (const char *)result, length,
735                         target, targetLimit,
736                         offsets, srcIndex,
737                         pErrorCode);
738}
739
740/*
741 * target<targetLimit; set error code for overflow
742 */
743U_CFUNC UBool
744ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
745                          UChar32 cp,
746                          const UChar **src, const UChar *srcLimit,
747                          char **target, const char *targetLimit,
748                          int32_t **offsets, int32_t srcIndex,
749                          UBool flush,
750                          UErrorCode *pErrorCode) {
751    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
752    int32_t match;
753
754    /* try to match */
755    match=ucnv_extMatchFromU(cx, cp,
756                             NULL, 0,
757                             *src, (int32_t)(srcLimit-*src),
758                             &value,
759                             cnv->useFallback, flush);
760
761    /* reject a match if the result is a single byte for DBCS-only */
762    if( match>=2 &&
763        !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 &&
764          cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY)
765    ) {
766        /* advance src pointer for the consumed input */
767        *src+=match-2; /* remove 2 for the initial code point */
768
769        /* write result to target */
770        ucnv_extWriteFromU(cnv, cx,
771                           value,
772                           target, targetLimit,
773                           offsets, srcIndex,
774                           pErrorCode);
775        return TRUE;
776    } else if(match<0) {
777        /* save state for partial match */
778        const UChar *s;
779        int32_t j;
780
781        /* copy the first code point */
782        cnv->preFromUFirstCP=cp;
783
784        /* now copy the newly consumed input */
785        s=*src;
786        match=-match-2; /* remove 2 for the initial code point */
787        for(j=0; j<match; ++j) {
788            cnv->preFromU[j]=*s++;
789        }
790        *src=s; /* same as *src=srcLimit; because we reached the end of input */
791        cnv->preFromULength=(int8_t)match;
792        return TRUE;
793    } else if(match==1) {
794        /* matched, no mapping but request for <subchar1> */
795        cnv->useSubChar1=TRUE;
796        return FALSE;
797    } else /* match==0 no match */ {
798        return FALSE;
799    }
800}
801
802/*
803 * Used by ISO 2022 implementation.
804 * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping
805 */
806U_CFUNC int32_t
807ucnv_extSimpleMatchFromU(const int32_t *cx,
808                         UChar32 cp, uint32_t *pValue,
809                         UBool useFallback) {
810    uint32_t value;
811    int32_t match;
812
813    /* try to match */
814    match=ucnv_extMatchFromU(cx,
815                             cp,
816                             NULL, 0,
817                             NULL, 0,
818                             &value,
819                             useFallback, TRUE);
820    if(match>=2) {
821        /* write result for simple, single-character conversion */
822        int32_t length;
823        int isRoundtrip;
824
825        isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value);
826        length=UCNV_EXT_FROM_U_GET_LENGTH(value);
827        value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
828
829        if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
830            *pValue=value;
831            return isRoundtrip ? length : -length;
832#if 0 /* not currently used */
833        } else if(length==4) {
834            /* de-serialize a 4-byte result */
835            const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
836            *pValue=
837                ((uint32_t)result[0]<<24)|
838                ((uint32_t)result[1]<<16)|
839                ((uint32_t)result[2]<<8)|
840                result[3];
841            return isRoundtrip ? 4 : -4;
842#endif
843        }
844    }
845
846    /*
847     * return no match because
848     * - match>1 && resultLength>4: result too long for simple conversion
849     * - match==1: no match found, <subchar1> preferred
850     * - match==0: no match found in the first place
851     * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
852     */
853    return 0;
854}
855
856/*
857 * continue partial match with new input, requires cnv->preFromUFirstCP>=0
858 * never called for simple, single-character conversion
859 */
860U_CFUNC void
861ucnv_extContinueMatchFromU(UConverter *cnv,
862                           UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
863                           UErrorCode *pErrorCode) {
864    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
865    int32_t match;
866
867    match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes,
868                             cnv->preFromUFirstCP,
869                             cnv->preFromU, cnv->preFromULength,
870                             pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
871                             &value,
872                             cnv->useFallback, pArgs->flush);
873    if(match>=2) {
874        match-=2; /* remove 2 for the initial code point */
875
876        if(match>=cnv->preFromULength) {
877            /* advance src pointer for the consumed input */
878            pArgs->source+=match-cnv->preFromULength;
879            cnv->preFromULength=0;
880        } else {
881            /* the match did not use all of preFromU[] - keep the rest for replay */
882            int32_t length=cnv->preFromULength-match;
883            uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR);
884            cnv->preFromULength=(int8_t)-length;
885        }
886
887        /* finish the partial match */
888        cnv->preFromUFirstCP=U_SENTINEL;
889
890        /* write result */
891        ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes,
892                           value,
893                           &pArgs->target, pArgs->targetLimit,
894                           &pArgs->offsets, srcIndex,
895                           pErrorCode);
896    } else if(match<0) {
897        /* save state for partial match */
898        const UChar *s;
899        int32_t j;
900
901        /* just _append_ the newly consumed input to preFromU[] */
902        s=pArgs->source;
903        match=-match-2; /* remove 2 for the initial code point */
904        for(j=cnv->preFromULength; j<match; ++j) {
905            cnv->preFromU[j]=*s++;
906        }
907        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
908        cnv->preFromULength=(int8_t)match;
909    } else /* match==0 or 1 */ {
910        /*
911         * no match
912         *
913         * We need to split the previous input into two parts:
914         *
915         * 1. The first code point is unmappable - that's how we got into
916         *    trying the extension data in the first place.
917         *    We need to move it from the preFromU buffer
918         *    to the error buffer, set an error code,
919         *    and prepare the rest of the previous input for 2.
920         *
921         * 2. The rest of the previous input must be converted once we
922         *    come back from the callback for the first code point.
923         *    At that time, we have to try again from scratch to convert
924         *    these input characters.
925         *    The replay will be handled by the ucnv.c conversion code.
926         */
927
928        if(match==1) {
929            /* matched, no mapping but request for <subchar1> */
930            cnv->useSubChar1=TRUE;
931        }
932
933        /* move the first code point to the error field */
934        cnv->fromUChar32=cnv->preFromUFirstCP;
935        cnv->preFromUFirstCP=U_SENTINEL;
936
937        /* mark preFromU for replay */
938        cnv->preFromULength=-cnv->preFromULength;
939
940        /* set the error code for unassigned */
941        *pErrorCode=U_INVALID_CHAR_FOUND;
942    }
943}
944
945static void
946ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
947                            const int32_t *cx,
948                            const USetAdder *sa,
949                            UBool useFallback,
950                            int32_t minLength,
951                            UChar32 c,
952                            UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
953                            int32_t sectionIndex,
954                            UErrorCode *pErrorCode) {
955    const UChar *fromUSectionUChars;
956    const uint32_t *fromUSectionValues;
957
958    uint32_t value;
959    int32_t i, count;
960
961    fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
962    fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
963
964    /* read first pair of the section */
965    count=*fromUSectionUChars++;
966    value=*fromUSectionValues++;
967
968    if( value!=0 &&
969        (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
970        UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
971    ) {
972        if(c>=0) {
973            /* add the initial code point */
974            sa->add(sa->set, c);
975        } else {
976            /* add the string so far */
977            sa->addString(sa->set, s, length);
978        }
979    }
980
981    for(i=0; i<count; ++i) {
982        /* append this code unit and recurse or add the string */
983        s[length]=fromUSectionUChars[i];
984        value=fromUSectionValues[i];
985
986        if(value==0) {
987            /* no mapping, do nothing */
988        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
989            ucnv_extGetUnicodeSetString(
990                sharedData, cx, sa, useFallback, minLength,
991                U_SENTINEL, s, length+1,
992                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
993                pErrorCode);
994        } else if((useFallback ?
995                      (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
996                      ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
997                          UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
998                  UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
999        ) {
1000            sa->addString(sa->set, s, length+1);
1001        }
1002    }
1003}
1004
1005U_CFUNC void
1006ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
1007                      const USetAdder *sa,
1008                      UConverterUnicodeSet which,
1009                      UConverterSetFilter filter,
1010                      UErrorCode *pErrorCode) {
1011    const int32_t *cx;
1012    const uint16_t *stage12, *stage3, *ps2, *ps3;
1013    const uint32_t *stage3b;
1014
1015    uint32_t value;
1016    int32_t st1, stage1Length, st2, st3, minLength;
1017    UBool useFallback;
1018
1019    UChar s[UCNV_EXT_MAX_UCHARS];
1020    UChar32 c;
1021    int32_t length;
1022
1023    cx=sharedData->mbcs.extIndexes;
1024    if(cx==NULL) {
1025        return;
1026    }
1027
1028    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
1029    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
1030    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
1031
1032    stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
1033
1034    useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
1035
1036    /* enumerate the from-Unicode trie table */
1037    c=0; /* keep track of the current code point while enumerating */
1038
1039    if(filter==UCNV_SET_FILTER_2022_CN) {
1040        minLength=3;
1041    } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
1042               filter!=UCNV_SET_FILTER_NONE
1043    ) {
1044        /* DBCS-only, ignore single-byte results */
1045        minLength=2;
1046    } else {
1047        minLength=1;
1048    }
1049
1050    /*
1051     * the trie enumeration is almost the same as
1052     * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1
1053     */
1054    for(st1=0; st1<stage1Length; ++st1) {
1055        st2=stage12[st1];
1056        if(st2>stage1Length) {
1057            ps2=stage12+st2;
1058            for(st2=0; st2<64; ++st2) {
1059                if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
1060                    /* read the stage 3 block */
1061                    ps3=stage3+st3;
1062
1063                    /*
1064                     * Add code points for which the roundtrip flag is set.
1065                     * Do not add <subchar1> entries or other (future?) pseudo-entries
1066                     * with an output length of 0, or entries with reserved bits set.
1067                     * Recurse for partial results.
1068                     */
1069                    do {
1070                        value=stage3b[*ps3++];
1071                        if(value==0) {
1072                            /* no mapping, do nothing */
1073                        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
1074                            length=0;
1075                            U16_APPEND_UNSAFE(s, length, c);
1076                            ucnv_extGetUnicodeSetString(
1077                                sharedData, cx, sa, useFallback, minLength,
1078                                c, s, length,
1079                                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
1080                                pErrorCode);
1081                        } else if((useFallback ?
1082                                      (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
1083                                      ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
1084                                          UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
1085                                  UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
1086                        ) {
1087                            switch(filter) {
1088                            case UCNV_SET_FILTER_2022_CN:
1089                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
1090                                    continue;
1091                                }
1092                                break;
1093                            case UCNV_SET_FILTER_SJIS:
1094                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
1095                                    continue;
1096                                }
1097                                break;
1098                            case UCNV_SET_FILTER_GR94DBCS:
1099                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
1100                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
1101                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
1102                                    continue;
1103                                }
1104                                break;
1105                            case UCNV_SET_FILTER_HZ:
1106                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
1107                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
1108                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
1109                                    continue;
1110                                }
1111                                break;
1112                            default:
1113                                /*
1114                                 * UCNV_SET_FILTER_NONE,
1115                                 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
1116                                 */
1117                                break;
1118                            }
1119                            sa->add(sa->set, c);
1120                        }
1121                    } while((++c&0xf)!=0);
1122                } else {
1123                    c+=16; /* empty stage 3 block */
1124                }
1125            }
1126        } else {
1127            c+=1024; /* empty stage 2 block */
1128        }
1129    }
1130}
1131
1132#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
1133