1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6*   Copyright (C) 2002-2016, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9******************************************************************************
10*   file name:  ucnvbocu.cpp
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2002mar27
16*   created by: Markus W. Scherer
17*
18*   This is an implementation of the Binary Ordered Compression for Unicode,
19*   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
20*/
21
22#include "unicode/utypes.h"
23
24#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
25
26#include "unicode/ucnv.h"
27#include "unicode/ucnv_cb.h"
28#include "unicode/utf16.h"
29#include "putilimp.h"
30#include "ucnv_bld.h"
31#include "ucnv_cnv.h"
32#include "uassert.h"
33
34/* BOCU-1 constants and macros ---------------------------------------------- */
35
36/*
37 * BOCU-1 encodes the code points of a Unicode string as
38 * a sequence of byte-encoded differences (slope detection),
39 * preserving lexical order.
40 *
41 * Optimize the difference-taking for runs of Unicode text within
42 * small scripts:
43 *
44 * Most small scripts are allocated within aligned 128-blocks of Unicode
45 * code points. Lexical order is preserved if the "previous code point" state
46 * is always moved into the middle of such a block.
47 *
48 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
49 * areas into the middle of those areas.
50 *
51 * C0 control codes and space are encoded with their US-ASCII bytes.
52 * "prev" is reset for C0 controls but not for space.
53 */
54
55/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* Byte values that delimit the ranges used for encoded differences. */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* How many lead byte values exist: BOCU1_MIN..BOCU1_MAX_LEAD inclusive. */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * Some C0 control byte values double as trail bytes.
 * BOCU1_TRAIL_BYTE_OFFSET converts a trail value at or above
 * BOCU1_TRAIL_CONTROLS_COUNT into its byte value (see BOCU1_TRAIL_TO_BYTE).
 */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* Total number of trail byte values: BOCU1_MIN..BOCU1_MAX_TRAIL plus the controls. */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * Number of single-byte codes on each side of BOCU1_MIDDLE;
 * the zero difference (==BOCU1_MIDDLE) is counted with the positive ones.
 */
#define BOCU1_SINGLE            64

/* Lead bytes available per sign for 2-, 3- and 4-byte sequences. */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* Largest and smallest difference that fits into one byte. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* Largest and smallest difference that fits into two bytes. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest and smallest difference that fits into three bytes. */
#define BOCU1_REACH_POS_3   (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte range. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* BOCU1_START_POS_4==BOCU1_MAX_LEAD */

/* Boundary lead bytes of the negative ranges, descending from the single-byte range. */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* BOCU1_START_NEG_4==BOCU1_MIN+1 */

/*
 * Sequence length (1..4) implied by a lead byte.
 * Must not be used with BOCU1_RESET. The ranges are nested, so the
 * narrowest (single-byte) range is tested first.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Sequence length (1..4) of a packed difference.
 * Packed 1..3-byte sequences carry their length in the top byte;
 * anything at or above 0x04000000 is a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
120
121/*
122 * 12 commonly used C0 control codes (and space) are only used to encode
123 * themselves directly,
124 * which makes BOCU-1 MIME-usable and reasonably safe for
125 * ASCII-oriented software.
126 *
127 * These controls are
128 *  0   NUL
129 *
130 *  7   BEL
131 *  8   BS
132 *
133 *  9   TAB
134 *  a   LF
135 *  b   VT
136 *  c   FF
137 *  d   CR
138 *
139 *  e   SO
140 *  f   SI
141 *
142 * 1a   SUB
143 * 1b   ESC
144 *
145 * The other 20 C0 controls are also encoded directly (to preserve order)
146 * but are also used as trail bytes in difference encoding
147 * (for better compression).
148 */
149#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
150
151/*
152 * Byte value map for control codes,
153 * from external byte values 0x00..0x20
154 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
155 * External byte values that are illegal as trail bytes are mapped to -1.
156 */
157static const int8_t
158bocu1ByteToTrail[BOCU1_MIN]={
159/*  0     1     2     3     4     5     6     7    */
160    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
161
162/*  8     9     a     b     c     d     e     f    */
163    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
164
165/*  10    11    12    13    14    15    16    17   */
166    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
167
168/*  18    19    1a    1b    1c    1d    1e    1f   */
169    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
170
171/*  20   */
172    -1
173};
174
175/*
176 * Byte value map for control codes,
177 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
178 * to external byte values 0x00..0x20.
179 */
180static const int8_t
181bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
182/*  0     1     2     3     4     5     6     7    */
183    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
184
185/*  8     9     a     b     c     d     e     f    */
186    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
187
188/*  10    11    12    13   */
189    0x1c, 0x1d, 0x1e, 0x1f
190};
191
/**
 * Division and modulo that round toward negative infinity.
 *
 * Plain integer division and modulo with a negative numerator
 * yield a negative modulo result and a quotient that is one more than
 * what we need here.
 * This macro adjusts the results so that the modulo value m is always >=0
 * (for a positive divisor d).
 *
 * For non-negative n, the if() condition is always false.
 *
 * The body is wrapped in do { } while(0) so that the macro behaves like a
 * single statement: "NEGDIVMOD(...);" is safe in an unbraced if/else.
 * The arguments are evaluated more than once; pass plain variables.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
213
/*
 * Range tests and a one-byte packer for diff values.
 * They let callers handle the short encodings without calling packDiff().
 * Each macro evaluates its argument more than once.
 */

/** True if diff lies in BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1 (one byte). */
#define DIFF_IS_SINGLE(diff) \
    ((diff)>=BOCU1_REACH_NEG_1 && (diff)<=BOCU1_REACH_POS_1)

/** The byte that encodes a one-byte diff: its offset from BOCU1_MIDDLE. */
#define PACK_SINGLE_DIFF(diff) ((diff)+BOCU1_MIDDLE)

/** True if diff lies in BOCU1_REACH_NEG_2..BOCU1_REACH_POS_2 (at most two bytes). */
#define DIFF_IS_DOUBLE(diff) \
    ((diff)>=BOCU1_REACH_NEG_2 && (diff)<=BOCU1_REACH_POS_2)
224
225/* BOCU-1 implementation functions ------------------------------------------ */
226
227#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
228
229/**
230 * Compute the next "previous" value for differencing
231 * from the current code point.
232 *
233 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
234 * @return "previous code point" state value
235 */
236static inline int32_t
237bocu1Prev(int32_t c) {
238    /* compute new prev */
239    if(/* 0x3040<=c && */ c<=0x309f) {
240        /* Hiragana is not 128-aligned */
241        return 0x3070;
242    } else if(0x4e00<=c && c<=0x9fa5) {
243        /* CJK Unihan */
244        return 0x4e00-BOCU1_REACH_NEG_2;
245    } else if(0xac00<=c /* && c<=0xd7a3 */) {
246        /* Korean Hangul */
247        return (0xd7a3+0xac00)/2;
248    } else {
249        /* mostly small scripts */
250        return BOCU1_SIMPLE_PREV(c);
251    }
252}
253
254/** Fast version of bocu1Prev() for most scripts. */
255#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
256
257/*
258 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
259 * The UConverter fields are used as follows:
260 *
261 * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262 *
263 * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
264 * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
265 */
266
267/* BOCU-1-from-Unicode conversion functions --------------------------------- */
268
269/**
270 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
271 * and return a packed integer with them.
272 *
273 * The encoding favors small absolute differences with short encodings
274 * to compress runs of same-script characters.
275 *
276 * Optimized version with unrolled loops and fewer floating-point operations
277 * than the standard packDiff().
278 *
279 * @param diff difference value -0x10ffff..0x10ffff
280 * @return
281 *      0x010000zz for 1-byte sequence zz
282 *      0x0200yyzz for 2-byte sequence yy zz
283 *      0x03xxyyzz for 3-byte sequence xx yy zz
284 *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
285 */
286static int32_t
287packDiff(int32_t diff) {
288    int32_t result, m;
289
290    U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
291    if(diff>=BOCU1_REACH_NEG_1) {
292        /* mostly positive differences, and single-byte negative ones */
293#if 0   /* single-byte case handled in macros, see below */
294        if(diff<=BOCU1_REACH_POS_1) {
295            /* single byte */
296            return 0x01000000|(BOCU1_MIDDLE+diff);
297        } else
298#endif
299        if(diff<=BOCU1_REACH_POS_2) {
300            /* two bytes */
301            diff-=BOCU1_REACH_POS_1+1;
302            result=0x02000000;
303
304            m=diff%BOCU1_TRAIL_COUNT;
305            diff/=BOCU1_TRAIL_COUNT;
306            result|=BOCU1_TRAIL_TO_BYTE(m);
307
308            result|=(BOCU1_START_POS_2+diff)<<8;
309        } else if(diff<=BOCU1_REACH_POS_3) {
310            /* three bytes */
311            diff-=BOCU1_REACH_POS_2+1;
312            result=0x03000000;
313
314            m=diff%BOCU1_TRAIL_COUNT;
315            diff/=BOCU1_TRAIL_COUNT;
316            result|=BOCU1_TRAIL_TO_BYTE(m);
317
318            m=diff%BOCU1_TRAIL_COUNT;
319            diff/=BOCU1_TRAIL_COUNT;
320            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
321
322            result|=(BOCU1_START_POS_3+diff)<<16;
323        } else {
324            /* four bytes */
325            diff-=BOCU1_REACH_POS_3+1;
326
327            m=diff%BOCU1_TRAIL_COUNT;
328            diff/=BOCU1_TRAIL_COUNT;
329            result=BOCU1_TRAIL_TO_BYTE(m);
330
331            m=diff%BOCU1_TRAIL_COUNT;
332            diff/=BOCU1_TRAIL_COUNT;
333            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
334
335            /*
336             * We know that / and % would deliver quotient 0 and rest=diff.
337             * Avoid division and modulo for performance.
338             */
339            result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
340
341            result|=((uint32_t)BOCU1_START_POS_4)<<24;
342        }
343    } else {
344        /* two- to four-byte negative differences */
345        if(diff>=BOCU1_REACH_NEG_2) {
346            /* two bytes */
347            diff-=BOCU1_REACH_NEG_1;
348            result=0x02000000;
349
350            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
351            result|=BOCU1_TRAIL_TO_BYTE(m);
352
353            result|=(BOCU1_START_NEG_2+diff)<<8;
354        } else if(diff>=BOCU1_REACH_NEG_3) {
355            /* three bytes */
356            diff-=BOCU1_REACH_NEG_2;
357            result=0x03000000;
358
359            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
360            result|=BOCU1_TRAIL_TO_BYTE(m);
361
362            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
363            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
364
365            result|=(BOCU1_START_NEG_3+diff)<<16;
366        } else {
367            /* four bytes */
368            diff-=BOCU1_REACH_NEG_3;
369
370            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
371            result=BOCU1_TRAIL_TO_BYTE(m);
372
373            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
374            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
375
376            /*
377             * We know that NEGDIVMOD would deliver
378             * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
379             * Avoid division and modulo for performance.
380             */
381            m=diff+BOCU1_TRAIL_COUNT;
382            result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
383
384            result|=BOCU1_MIN<<24;
385        }
386    }
387    return result;
388}
389
390
391static void U_CALLCONV
392_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
393                             UErrorCode *pErrorCode) {
394    UConverter *cnv;
395    const UChar *source, *sourceLimit;
396    uint8_t *target;
397    int32_t targetCapacity;
398    int32_t *offsets;
399
400    int32_t prev, c, diff;
401
402    int32_t sourceIndex, nextSourceIndex;
403
404    /* set up the local pointers */
405    cnv=pArgs->converter;
406    source=pArgs->source;
407    sourceLimit=pArgs->sourceLimit;
408    target=(uint8_t *)pArgs->target;
409    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
410    offsets=pArgs->offsets;
411
412    /* get the converter state from UConverter */
413    c=cnv->fromUChar32;
414    prev=(int32_t)cnv->fromUnicodeStatus;
415    if(prev==0) {
416        prev=BOCU1_ASCII_PREV;
417    }
418
419    /* sourceIndex=-1 if the current character began in the previous buffer */
420    sourceIndex= c==0 ? 0 : -1;
421    nextSourceIndex=0;
422
423    /* conversion loop */
424    if(c!=0 && targetCapacity>0) {
425        goto getTrail;
426    }
427
428fastSingle:
429    /* fast loop for single-byte differences */
430    /* use only one loop counter variable, targetCapacity, not also source */
431    diff=(int32_t)(sourceLimit-source);
432    if(targetCapacity>diff) {
433        targetCapacity=diff;
434    }
435    while(targetCapacity>0 && (c=*source)<0x3000) {
436        if(c<=0x20) {
437            if(c!=0x20) {
438                prev=BOCU1_ASCII_PREV;
439            }
440            *target++=(uint8_t)c;
441            *offsets++=nextSourceIndex++;
442            ++source;
443            --targetCapacity;
444        } else {
445            diff=c-prev;
446            if(DIFF_IS_SINGLE(diff)) {
447                prev=BOCU1_SIMPLE_PREV(c);
448                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
449                *offsets++=nextSourceIndex++;
450                ++source;
451                --targetCapacity;
452            } else {
453                break;
454            }
455        }
456    }
457    /* restore real values */
458    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
459    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
460
461    /* regular loop for all cases */
462    while(source<sourceLimit) {
463        if(targetCapacity>0) {
464            c=*source++;
465            ++nextSourceIndex;
466
467            if(c<=0x20) {
468                /*
469                 * ISO C0 control & space:
470                 * Encode directly for MIME compatibility,
471                 * and reset state except for space, to not disrupt compression.
472                 */
473                if(c!=0x20) {
474                    prev=BOCU1_ASCII_PREV;
475                }
476                *target++=(uint8_t)c;
477                *offsets++=sourceIndex;
478                --targetCapacity;
479
480                sourceIndex=nextSourceIndex;
481                continue;
482            }
483
484            if(U16_IS_LEAD(c)) {
485getTrail:
486                if(source<sourceLimit) {
487                    /* test the following code unit */
488                    UChar trail=*source;
489                    if(U16_IS_TRAIL(trail)) {
490                        ++source;
491                        ++nextSourceIndex;
492                        c=U16_GET_SUPPLEMENTARY(c, trail);
493                    }
494                } else {
495                    /* no more input */
496                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
497                    break;
498                }
499            }
500
501            /*
502             * all other Unicode code points c==U+0021..U+10ffff
503             * are encoded with the difference c-prev
504             *
505             * a new prev is computed from c,
506             * placed in the middle of a 0x80-block (for most small scripts) or
507             * in the middle of the Unihan and Hangul blocks
508             * to statistically minimize the following difference
509             */
510            diff=c-prev;
511            prev=BOCU1_PREV(c);
512            if(DIFF_IS_SINGLE(diff)) {
513                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
514                *offsets++=sourceIndex;
515                --targetCapacity;
516                sourceIndex=nextSourceIndex;
517                if(c<0x3000) {
518                    goto fastSingle;
519                }
520            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
521                /* optimize 2-byte case */
522                int32_t m;
523
524                if(diff>=0) {
525                    diff-=BOCU1_REACH_POS_1+1;
526                    m=diff%BOCU1_TRAIL_COUNT;
527                    diff/=BOCU1_TRAIL_COUNT;
528                    diff+=BOCU1_START_POS_2;
529                } else {
530                    diff-=BOCU1_REACH_NEG_1;
531                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
532                    diff+=BOCU1_START_NEG_2;
533                }
534                *target++=(uint8_t)diff;
535                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
536                *offsets++=sourceIndex;
537                *offsets++=sourceIndex;
538                targetCapacity-=2;
539                sourceIndex=nextSourceIndex;
540            } else {
541                int32_t length; /* will be 2..4 */
542
543                diff=packDiff(diff);
544                length=BOCU1_LENGTH_FROM_PACKED(diff);
545
546                /* write the output character bytes from diff and length */
547                /* from the first if in the loop we know that targetCapacity>0 */
548                if(length<=targetCapacity) {
549                    switch(length) {
550                        /* each branch falls through to the next one */
551                    case 4:
552                        *target++=(uint8_t)(diff>>24);
553                        *offsets++=sourceIndex;
554                        U_FALLTHROUGH;
555                    case 3:
556                        *target++=(uint8_t)(diff>>16);
557                        *offsets++=sourceIndex;
558                        U_FALLTHROUGH;
559                    case 2:
560                        *target++=(uint8_t)(diff>>8);
561                        *offsets++=sourceIndex;
562                    /* case 1: handled above */
563                        *target++=(uint8_t)diff;
564                        *offsets++=sourceIndex;
565                        U_FALLTHROUGH;
566                    default:
567                        /* will never occur */
568                        break;
569                    }
570                    targetCapacity-=length;
571                    sourceIndex=nextSourceIndex;
572                } else {
573                    uint8_t *charErrorBuffer;
574
575                    /*
576                     * We actually do this backwards here:
577                     * In order to save an intermediate variable, we output
578                     * first to the overflow buffer what does not fit into the
579                     * regular target.
580                     */
581                    /* we know that 1<=targetCapacity<length<=4 */
582                    length-=targetCapacity;
583                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
584                    switch(length) {
585                        /* each branch falls through to the next one */
586                    case 3:
587                        *charErrorBuffer++=(uint8_t)(diff>>16);
588                        U_FALLTHROUGH;
589                    case 2:
590                        *charErrorBuffer++=(uint8_t)(diff>>8);
591                        U_FALLTHROUGH;
592                    case 1:
593                        *charErrorBuffer=(uint8_t)diff;
594                        U_FALLTHROUGH;
595                    default:
596                        /* will never occur */
597                        break;
598                    }
599                    cnv->charErrorBufferLength=(int8_t)length;
600
601                    /* now output what fits into the regular target */
602                    diff>>=8*length; /* length was reduced by targetCapacity */
603                    switch(targetCapacity) {
604                        /* each branch falls through to the next one */
605                    case 3:
606                        *target++=(uint8_t)(diff>>16);
607                        *offsets++=sourceIndex;
608                        U_FALLTHROUGH;
609                    case 2:
610                        *target++=(uint8_t)(diff>>8);
611                        *offsets++=sourceIndex;
612                        U_FALLTHROUGH;
613                    case 1:
614                        *target++=(uint8_t)diff;
615                        *offsets++=sourceIndex;
616                        U_FALLTHROUGH;
617                    default:
618                        /* will never occur */
619                        break;
620                    }
621
622                    /* target overflow */
623                    targetCapacity=0;
624                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
625                    break;
626                }
627            }
628        } else {
629            /* target is full */
630            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
631            break;
632        }
633    }
634
635    /* set the converter state back into UConverter */
636    cnv->fromUChar32= c<0 ? -c : 0;
637    cnv->fromUnicodeStatus=(uint32_t)prev;
638
639    /* write back the updated pointers */
640    pArgs->source=source;
641    pArgs->target=(char *)target;
642    pArgs->offsets=offsets;
643}
644
645/*
646 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
647 * If a change is made in the original function, then either
648 * change this function the same way or
649 * re-copy the original function and remove the variables
650 * offsets, sourceIndex, and nextSourceIndex.
651 */
652static void U_CALLCONV
653_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
654                  UErrorCode *pErrorCode) {
655    UConverter *cnv;
656    const UChar *source, *sourceLimit;
657    uint8_t *target;
658    int32_t targetCapacity;
659
660    int32_t prev, c, diff;
661
662    /* set up the local pointers */
663    cnv=pArgs->converter;
664    source=pArgs->source;
665    sourceLimit=pArgs->sourceLimit;
666    target=(uint8_t *)pArgs->target;
667    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
668
669    /* get the converter state from UConverter */
670    c=cnv->fromUChar32;
671    prev=(int32_t)cnv->fromUnicodeStatus;
672    if(prev==0) {
673        prev=BOCU1_ASCII_PREV;
674    }
675
676    /* conversion loop */
677    if(c!=0 && targetCapacity>0) {
678        goto getTrail;
679    }
680
681fastSingle:
682    /* fast loop for single-byte differences */
683    /* use only one loop counter variable, targetCapacity, not also source */
684    diff=(int32_t)(sourceLimit-source);
685    if(targetCapacity>diff) {
686        targetCapacity=diff;
687    }
688    while(targetCapacity>0 && (c=*source)<0x3000) {
689        if(c<=0x20) {
690            if(c!=0x20) {
691                prev=BOCU1_ASCII_PREV;
692            }
693            *target++=(uint8_t)c;
694        } else {
695            diff=c-prev;
696            if(DIFF_IS_SINGLE(diff)) {
697                prev=BOCU1_SIMPLE_PREV(c);
698                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
699            } else {
700                break;
701            }
702        }
703        ++source;
704        --targetCapacity;
705    }
706    /* restore real values */
707    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
708
709    /* regular loop for all cases */
710    while(source<sourceLimit) {
711        if(targetCapacity>0) {
712            c=*source++;
713
714            if(c<=0x20) {
715                /*
716                 * ISO C0 control & space:
717                 * Encode directly for MIME compatibility,
718                 * and reset state except for space, to not disrupt compression.
719                 */
720                if(c!=0x20) {
721                    prev=BOCU1_ASCII_PREV;
722                }
723                *target++=(uint8_t)c;
724                --targetCapacity;
725                continue;
726            }
727
728            if(U16_IS_LEAD(c)) {
729getTrail:
730                if(source<sourceLimit) {
731                    /* test the following code unit */
732                    UChar trail=*source;
733                    if(U16_IS_TRAIL(trail)) {
734                        ++source;
735                        c=U16_GET_SUPPLEMENTARY(c, trail);
736                    }
737                } else {
738                    /* no more input */
739                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
740                    break;
741                }
742            }
743
744            /*
745             * all other Unicode code points c==U+0021..U+10ffff
746             * are encoded with the difference c-prev
747             *
748             * a new prev is computed from c,
749             * placed in the middle of a 0x80-block (for most small scripts) or
750             * in the middle of the Unihan and Hangul blocks
751             * to statistically minimize the following difference
752             */
753            diff=c-prev;
754            prev=BOCU1_PREV(c);
755            if(DIFF_IS_SINGLE(diff)) {
756                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
757                --targetCapacity;
758                if(c<0x3000) {
759                    goto fastSingle;
760                }
761            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
762                /* optimize 2-byte case */
763                int32_t m;
764
765                if(diff>=0) {
766                    diff-=BOCU1_REACH_POS_1+1;
767                    m=diff%BOCU1_TRAIL_COUNT;
768                    diff/=BOCU1_TRAIL_COUNT;
769                    diff+=BOCU1_START_POS_2;
770                } else {
771                    diff-=BOCU1_REACH_NEG_1;
772                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
773                    diff+=BOCU1_START_NEG_2;
774                }
775                *target++=(uint8_t)diff;
776                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
777                targetCapacity-=2;
778            } else {
779                int32_t length; /* will be 2..4 */
780
781                diff=packDiff(diff);
782                length=BOCU1_LENGTH_FROM_PACKED(diff);
783
784                /* write the output character bytes from diff and length */
785                /* from the first if in the loop we know that targetCapacity>0 */
786                if(length<=targetCapacity) {
787                    switch(length) {
788                        /* each branch falls through to the next one */
789                    case 4:
790                        *target++=(uint8_t)(diff>>24);
791                        U_FALLTHROUGH;
792                    case 3:
793                        *target++=(uint8_t)(diff>>16);
794                    /* case 2: handled above */
795                        *target++=(uint8_t)(diff>>8);
796                    /* case 1: handled above */
797                        *target++=(uint8_t)diff;
798                        U_FALLTHROUGH;
799                    default:
800                        /* will never occur */
801                        break;
802                    }
803                    targetCapacity-=length;
804                } else {
805                    uint8_t *charErrorBuffer;
806
807                    /*
808                     * We actually do this backwards here:
809                     * In order to save an intermediate variable, we output
810                     * first to the overflow buffer what does not fit into the
811                     * regular target.
812                     */
813                    /* we know that 1<=targetCapacity<length<=4 */
814                    length-=targetCapacity;
815                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
816                    switch(length) {
817                        /* each branch falls through to the next one */
818                    case 3:
819                        *charErrorBuffer++=(uint8_t)(diff>>16);
820                        U_FALLTHROUGH;
821                    case 2:
822                        *charErrorBuffer++=(uint8_t)(diff>>8);
823                        U_FALLTHROUGH;
824                    case 1:
825                        *charErrorBuffer=(uint8_t)diff;
826                        U_FALLTHROUGH;
827                    default:
828                        /* will never occur */
829                        break;
830                    }
831                    cnv->charErrorBufferLength=(int8_t)length;
832
833                    /* now output what fits into the regular target */
834                    diff>>=8*length; /* length was reduced by targetCapacity */
835                    switch(targetCapacity) {
836                        /* each branch falls through to the next one */
837                    case 3:
838                        *target++=(uint8_t)(diff>>16);
839                        U_FALLTHROUGH;
840                    case 2:
841                        *target++=(uint8_t)(diff>>8);
842                        U_FALLTHROUGH;
843                    case 1:
844                        *target++=(uint8_t)diff;
845                        U_FALLTHROUGH;
846                    default:
847                        /* will never occur */
848                        break;
849                    }
850
851                    /* target overflow */
852                    targetCapacity=0;
853                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
854                    break;
855                }
856            }
857        } else {
858            /* target is full */
859            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
860            break;
861        }
862    }
863
864    /* set the converter state back into UConverter */
865    cnv->fromUChar32= c<0 ? -c : 0;
866    cnv->fromUnicodeStatus=(uint32_t)prev;
867
868    /* write back the updated pointers */
869    pArgs->source=source;
870    pArgs->target=(char *)target;
871}
872
873/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
874
875/**
876 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
877 *
878 * @param b lead byte;
879 *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
880 * @return (diff<<2)|count
881 */
882static inline int32_t
883decodeBocu1LeadByte(int32_t b) {
884    int32_t diff, count;
885
886    if(b>=BOCU1_START_NEG_2) {
887        /* positive difference */
888        if(b<BOCU1_START_POS_3) {
889            /* two bytes */
890            diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
891            count=1;
892        } else if(b<BOCU1_START_POS_4) {
893            /* three bytes */
894            diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
895            count=2;
896        } else {
897            /* four bytes */
898            diff=BOCU1_REACH_POS_3+1;
899            count=3;
900        }
901    } else {
902        /* negative difference */
903        if(b>=BOCU1_START_NEG_3) {
904            /* two bytes */
905            diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
906            count=1;
907        } else if(b>BOCU1_MIN) {
908            /* three bytes */
909            diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
910            count=2;
911        } else {
912            /* four bytes */
913            diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
914            count=3;
915        }
916    }
917
918    /* return the state for decoding the trail byte(s) */
919    return (diff<<2)|count;
920}
921
922/**
923 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
924 *
925 * @param count number of remaining trail bytes including this one
926 * @param b trail byte
927 * @return new delta for diff including b - <0 indicates an error
928 *
929 * @see decodeBocu1
930 */
931static inline int32_t
932decodeBocu1TrailByte(int32_t count, int32_t b) {
933    if(b<=0x20) {
934        /* skip some C0 controls and make the trail byte range contiguous */
935        b=bocu1ByteToTrail[b];
936        /* b<0 for an illegal trail byte value will result in return<0 below */
937#if BOCU1_MAX_TRAIL<0xff
938    } else if(b>BOCU1_MAX_TRAIL) {
939        return -99;
940#endif
941    } else {
942        b-=BOCU1_TRAIL_BYTE_OFFSET;
943    }
944
945    /* add trail byte into difference and decrement count */
946    if(count==1) {
947        return b;
948    } else if(count==2) {
949        return b*BOCU1_TRAIL_COUNT;
950    } else /* count==3 */ {
951        return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
952    }
953}
954
955static void U_CALLCONV
956_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
957                           UErrorCode *pErrorCode) {
958    UConverter *cnv;
959    const uint8_t *source, *sourceLimit;
960    UChar *target;
961    const UChar *targetLimit;
962    int32_t *offsets;
963
964    int32_t prev, count, diff, c;
965
966    int8_t byteIndex;
967    uint8_t *bytes;
968
969    int32_t sourceIndex, nextSourceIndex;
970
971    /* set up the local pointers */
972    cnv=pArgs->converter;
973    source=(const uint8_t *)pArgs->source;
974    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
975    target=pArgs->target;
976    targetLimit=pArgs->targetLimit;
977    offsets=pArgs->offsets;
978
979    /* get the converter state from UConverter */
980    prev=(int32_t)cnv->toUnicodeStatus;
981    if(prev==0) {
982        prev=BOCU1_ASCII_PREV;
983    }
984    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
985    count=diff&3;
986    diff>>=2;
987
988    byteIndex=cnv->toULength;
989    bytes=cnv->toUBytes;
990
991    /* sourceIndex=-1 if the current character began in the previous buffer */
992    sourceIndex=byteIndex==0 ? 0 : -1;
993    nextSourceIndex=0;
994
995    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
996    if(count>0 && byteIndex>0 && target<targetLimit) {
997        goto getTrail;
998    }
999
1000fastSingle:
1001    /* fast loop for single-byte differences */
1002    /* use count as the only loop counter variable */
1003    diff=(int32_t)(sourceLimit-source);
1004    count=(int32_t)(pArgs->targetLimit-target);
1005    if(count>diff) {
1006        count=diff;
1007    }
1008    while(count>0) {
1009        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1010            c=prev+(c-BOCU1_MIDDLE);
1011            if(c<0x3000) {
1012                *target++=(UChar)c;
1013                *offsets++=nextSourceIndex++;
1014                prev=BOCU1_SIMPLE_PREV(c);
1015            } else {
1016                break;
1017            }
1018        } else if(c<=0x20) {
1019            if(c!=0x20) {
1020                prev=BOCU1_ASCII_PREV;
1021            }
1022            *target++=(UChar)c;
1023            *offsets++=nextSourceIndex++;
1024        } else {
1025            break;
1026        }
1027        ++source;
1028        --count;
1029    }
1030    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1031
1032    /* decode a sequence of single and lead bytes */
1033    while(source<sourceLimit) {
1034        if(target>=targetLimit) {
1035            /* target is full */
1036            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1037            break;
1038        }
1039
1040        ++nextSourceIndex;
1041        c=*source++;
1042        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1043            /* Write a code point directly from a single-byte difference. */
1044            c=prev+(c-BOCU1_MIDDLE);
1045            if(c<0x3000) {
1046                *target++=(UChar)c;
1047                *offsets++=sourceIndex;
1048                prev=BOCU1_SIMPLE_PREV(c);
1049                sourceIndex=nextSourceIndex;
1050                goto fastSingle;
1051            }
1052        } else if(c<=0x20) {
1053            /*
1054             * Direct-encoded C0 control code or space.
1055             * Reset prev for C0 control codes but not for space.
1056             */
1057            if(c!=0x20) {
1058                prev=BOCU1_ASCII_PREV;
1059            }
1060            *target++=(UChar)c;
1061            *offsets++=sourceIndex;
1062            sourceIndex=nextSourceIndex;
1063            continue;
1064        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1065            /* Optimize two-byte case. */
1066            if(c>=BOCU1_MIDDLE) {
1067                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1068            } else {
1069                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1070            }
1071
1072            /* trail byte */
1073            ++nextSourceIndex;
1074            c=decodeBocu1TrailByte(1, *source++);
1075            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1076                bytes[0]=source[-2];
1077                bytes[1]=source[-1];
1078                byteIndex=2;
1079                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1080                break;
1081            }
1082        } else if(c==BOCU1_RESET) {
1083            /* only reset the state, no code point */
1084            prev=BOCU1_ASCII_PREV;
1085            sourceIndex=nextSourceIndex;
1086            continue;
1087        } else {
1088            /*
1089             * For multi-byte difference lead bytes, set the decoder state
1090             * with the partial difference value from the lead byte and
1091             * with the number of trail bytes.
1092             */
1093            bytes[0]=(uint8_t)c;
1094            byteIndex=1;
1095
1096            diff=decodeBocu1LeadByte(c);
1097            count=diff&3;
1098            diff>>=2;
1099getTrail:
1100            for(;;) {
1101                if(source>=sourceLimit) {
1102                    goto endloop;
1103                }
1104                ++nextSourceIndex;
1105                c=bytes[byteIndex++]=*source++;
1106
1107                /* trail byte in any position */
1108                c=decodeBocu1TrailByte(count, c);
1109                if(c<0) {
1110                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1111                    goto endloop;
1112                }
1113
1114                diff+=c;
1115                if(--count==0) {
1116                    /* final trail byte, deliver a code point */
1117                    byteIndex=0;
1118                    c=prev+diff;
1119                    if((uint32_t)c>0x10ffff) {
1120                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121                        goto endloop;
1122                    }
1123                    break;
1124                }
1125            }
1126        }
1127
1128        /* calculate the next prev and output c */
1129        prev=BOCU1_PREV(c);
1130        if(c<=0xffff) {
1131            *target++=(UChar)c;
1132            *offsets++=sourceIndex;
1133        } else {
1134            /* output surrogate pair */
1135            *target++=U16_LEAD(c);
1136            if(target<targetLimit) {
1137                *target++=U16_TRAIL(c);
1138                *offsets++=sourceIndex;
1139                *offsets++=sourceIndex;
1140            } else {
1141                /* target overflow */
1142                *offsets++=sourceIndex;
1143                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1144                cnv->UCharErrorBufferLength=1;
1145                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1146                break;
1147            }
1148        }
1149        sourceIndex=nextSourceIndex;
1150    }
1151endloop:
1152
1153    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1154        /* set the converter state in UConverter to deal with the next character */
1155        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1156        cnv->mode=0;
1157    } else {
1158        /* set the converter state back into UConverter */
1159        cnv->toUnicodeStatus=(uint32_t)prev;
1160        cnv->mode=(diff<<2)|count;
1161    }
1162    cnv->toULength=byteIndex;
1163
1164    /* write back the updated pointers */
1165    pArgs->source=(const char *)source;
1166    pArgs->target=target;
1167    pArgs->offsets=offsets;
1168    return;
1169}
1170
1171/*
1172 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1173 * If a change is made in the original function, then either
1174 * change this function the same way or
1175 * re-copy the original function and remove the variables
1176 * offsets, sourceIndex, and nextSourceIndex.
1177 */
1178static void U_CALLCONV
1179_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1180                UErrorCode *pErrorCode) {
1181    UConverter *cnv;
1182    const uint8_t *source, *sourceLimit;
1183    UChar *target;
1184    const UChar *targetLimit;
1185
1186    int32_t prev, count, diff, c;
1187
1188    int8_t byteIndex;
1189    uint8_t *bytes;
1190
1191    /* set up the local pointers */
1192    cnv=pArgs->converter;
1193    source=(const uint8_t *)pArgs->source;
1194    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1195    target=pArgs->target;
1196    targetLimit=pArgs->targetLimit;
1197
1198    /* get the converter state from UConverter */
1199    prev=(int32_t)cnv->toUnicodeStatus;
1200    if(prev==0) {
1201        prev=BOCU1_ASCII_PREV;
1202    }
1203    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1204    count=diff&3;
1205    diff>>=2;
1206
1207    byteIndex=cnv->toULength;
1208    bytes=cnv->toUBytes;
1209
1210    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1211    if(count>0 && byteIndex>0 && target<targetLimit) {
1212        goto getTrail;
1213    }
1214
1215fastSingle:
1216    /* fast loop for single-byte differences */
1217    /* use count as the only loop counter variable */
1218    diff=(int32_t)(sourceLimit-source);
1219    count=(int32_t)(pArgs->targetLimit-target);
1220    if(count>diff) {
1221        count=diff;
1222    }
1223    while(count>0) {
1224        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1225            c=prev+(c-BOCU1_MIDDLE);
1226            if(c<0x3000) {
1227                *target++=(UChar)c;
1228                prev=BOCU1_SIMPLE_PREV(c);
1229            } else {
1230                break;
1231            }
1232        } else if(c<=0x20) {
1233            if(c!=0x20) {
1234                prev=BOCU1_ASCII_PREV;
1235            }
1236            *target++=(UChar)c;
1237        } else {
1238            break;
1239        }
1240        ++source;
1241        --count;
1242    }
1243
1244    /* decode a sequence of single and lead bytes */
1245    while(source<sourceLimit) {
1246        if(target>=targetLimit) {
1247            /* target is full */
1248            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249            break;
1250        }
1251
1252        c=*source++;
1253        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1254            /* Write a code point directly from a single-byte difference. */
1255            c=prev+(c-BOCU1_MIDDLE);
1256            if(c<0x3000) {
1257                *target++=(UChar)c;
1258                prev=BOCU1_SIMPLE_PREV(c);
1259                goto fastSingle;
1260            }
1261        } else if(c<=0x20) {
1262            /*
1263             * Direct-encoded C0 control code or space.
1264             * Reset prev for C0 control codes but not for space.
1265             */
1266            if(c!=0x20) {
1267                prev=BOCU1_ASCII_PREV;
1268            }
1269            *target++=(UChar)c;
1270            continue;
1271        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1272            /* Optimize two-byte case. */
1273            if(c>=BOCU1_MIDDLE) {
1274                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1275            } else {
1276                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1277            }
1278
1279            /* trail byte */
1280            c=decodeBocu1TrailByte(1, *source++);
1281            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1282                bytes[0]=source[-2];
1283                bytes[1]=source[-1];
1284                byteIndex=2;
1285                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1286                break;
1287            }
1288        } else if(c==BOCU1_RESET) {
1289            /* only reset the state, no code point */
1290            prev=BOCU1_ASCII_PREV;
1291            continue;
1292        } else {
1293            /*
1294             * For multi-byte difference lead bytes, set the decoder state
1295             * with the partial difference value from the lead byte and
1296             * with the number of trail bytes.
1297             */
1298            bytes[0]=(uint8_t)c;
1299            byteIndex=1;
1300
1301            diff=decodeBocu1LeadByte(c);
1302            count=diff&3;
1303            diff>>=2;
1304getTrail:
1305            for(;;) {
1306                if(source>=sourceLimit) {
1307                    goto endloop;
1308                }
1309                c=bytes[byteIndex++]=*source++;
1310
1311                /* trail byte in any position */
1312                c=decodeBocu1TrailByte(count, c);
1313                if(c<0) {
1314                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1315                    goto endloop;
1316                }
1317
1318                diff+=c;
1319                if(--count==0) {
1320                    /* final trail byte, deliver a code point */
1321                    byteIndex=0;
1322                    c=prev+diff;
1323                    if((uint32_t)c>0x10ffff) {
1324                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1325                        goto endloop;
1326                    }
1327                    break;
1328                }
1329            }
1330        }
1331
1332        /* calculate the next prev and output c */
1333        prev=BOCU1_PREV(c);
1334        if(c<=0xffff) {
1335            *target++=(UChar)c;
1336        } else {
1337            /* output surrogate pair */
1338            *target++=U16_LEAD(c);
1339            if(target<targetLimit) {
1340                *target++=U16_TRAIL(c);
1341            } else {
1342                /* target overflow */
1343                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1344                cnv->UCharErrorBufferLength=1;
1345                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1346                break;
1347            }
1348        }
1349    }
1350endloop:
1351
1352    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1353        /* set the converter state in UConverter to deal with the next character */
1354        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1355        cnv->mode=0;
1356    } else {
1357        /* set the converter state back into UConverter */
1358        cnv->toUnicodeStatus=(uint32_t)prev;
1359        cnv->mode=(diff<<2)|count;
1360    }
1361    cnv->toULength=byteIndex;
1362
1363    /* write back the updated pointers */
1364    pArgs->source=(const char *)source;
1365    pArgs->target=target;
1366    return;
1367}
1368
1369/* miscellaneous ------------------------------------------------------------ */
1370
1371static const UConverterImpl _Bocu1Impl={
1372    UCNV_BOCU1,
1373
1374    NULL,
1375    NULL,
1376
1377    NULL,
1378    NULL,
1379    NULL,
1380
1381    _Bocu1ToUnicode,
1382    _Bocu1ToUnicodeWithOffsets,
1383    _Bocu1FromUnicode,
1384    _Bocu1FromUnicodeWithOffsets,
1385    NULL,
1386
1387    NULL,
1388    NULL,
1389    NULL,
1390    NULL,
1391    ucnv_getCompleteUnicodeSet,
1392
1393    NULL,
1394    NULL
1395};
1396
1397static const UConverterStaticData _Bocu1StaticData={
1398    sizeof(UConverterStaticData),
1399    "BOCU-1",
1400    1214, /* CCSID for BOCU-1 */
1401    UCNV_IBM, UCNV_BOCU1,
1402    1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1403    { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1404    FALSE, FALSE,
1405    0,
1406    0,
1407    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1408};
1409
1410const UConverterSharedData _Bocu1Data=
1411        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
1412
1413#endif
1414