1/*
2******************************************************************************
3*
4*   Copyright (C) 2002-2005, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  ucnvbocu.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2002mar27
14*   created by: Markus W. Scherer
15*
16*   This is an implementation of the Binary Ordered Compression for Unicode,
17*   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_CONVERSION
23
24#include "unicode/ucnv.h"
25#include "unicode/ucnv_cb.h"
26#include "ucnv_bld.h"
27#include "ucnv_cnv.h"
28
29/* BOCU-1 constants and macros ---------------------------------------------- */
30
31/*
32 * BOCU-1 encodes the code points of a Unicode string as
33 * a sequence of byte-encoded differences (slope detection),
34 * preserving lexical order.
35 *
36 * Optimize the difference-taking for runs of Unicode text within
37 * small scripts:
38 *
39 * Most small scripts are allocated within aligned 128-blocks of Unicode
40 * code points. Lexical order is preserved if the "previous code point" state
41 * is always moved into the middle of such a block.
42 *
43 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
44 * areas into the middle of those areas.
45 *
46 * C0 control codes and space are encoded with their US-ASCII bytes.
47 * "prev" is reset for C0 controls but not for space.
48 */
49
/* "prev" starts out in the middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* limits of the byte values that are used to encode differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* how many lead byte values there are */
#define BOCU1_COUNT             (1+BOCU1_MAX_LEAD-BOCU1_MIN)

/* some C0 control byte values double as trail bytes: account for them */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* how many trail byte values there are */
#define BOCU1_TRAIL_COUNT \
    (BOCU1_TRAIL_CONTROLS_COUNT+(1+BOCU1_MAX_TRAIL-BOCU1_MIN))

/*
 * how many single-byte codes there are in each direction
 * (the zero difference, encoded as BOCU1_MIDDLE, counts as positive)
 */
#define BOCU1_SINGLE            64

/* lead bytes per direction for 2-, 3- and 4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* largest and smallest difference that fits into one byte */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* largest and smallest difference that fits into two bytes */
#define BOCU1_REACH_POS_2   (BOCU1_LEAD_2*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* largest and smallest difference that fits into three bytes */
#define BOCU1_REACH_POS_3 \
    (BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2)
#define BOCU1_REACH_NEG_3 \
    (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* first lead byte of the positive 2/3/4-byte sequences */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_SINGLE)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/* first lead byte of the single-byte range, and of the negative 2/3-byte ranges */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE-BOCU1_SINGLE)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/* Sequence length (1..4) implied by a lead byte (which must not be BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    (((lead)>=BOCU1_START_NEG_2 && (lead)<BOCU1_START_POS_2) ? 1 : \
     (((lead)>=BOCU1_START_NEG_3 && (lead)<BOCU1_START_POS_3) ? 2 : \
      (((lead)>=BOCU1_START_NEG_4 && (lead)<BOCU1_START_POS_4) ? 3 : 4)))

/*
 * Sequence length (1..4) of a packed byte sequence:
 * the top byte is the length for 1..3 bytes, anything larger is a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)>=0x04000000 ? 4 : (packed)>>24)
115
116/*
117 * 12 commonly used C0 control codes (and space) are only used to encode
118 * themselves directly,
119 * which makes BOCU-1 MIME-usable and reasonably safe for
120 * ASCII-oriented software.
121 *
122 * These controls are
123 *  0   NUL
124 *
125 *  7   BEL
126 *  8   BS
127 *
128 *  9   TAB
129 *  a   LF
130 *  b   VT
131 *  c   FF
132 *  d   CR
133 *
134 *  e   SO
135 *  f   SI
136 *
137 * 1a   SUB
138 * 1b   ESC
139 *
140 * The other 20 C0 controls are also encoded directly (to preserve order)
141 * but are also used as trail bytes in difference encoding
142 * (for better compression).
143 */
/*
 * Map a trail value t from the difference calculation to its external byte:
 * values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in bocu1TrailToByte[],
 * all others are shifted up by BOCU1_TRAIL_BYTE_OFFSET.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
145
146/*
147 * Byte value map for control codes,
148 * from external byte values 0x00..0x20
149 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
150 * External byte values that are illegal as trail bytes are mapped to -1.
151 */
152static const int8_t
153bocu1ByteToTrail[BOCU1_MIN]={
154/*  0     1     2     3     4     5     6     7    */
155    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
156
157/*  8     9     a     b     c     d     e     f    */
158    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
159
160/*  10    11    12    13    14    15    16    17   */
161    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
162
163/*  18    19    1a    1b    1c    1d    1e    1f   */
164    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
165
166/*  20   */
167    -1
168};
169
170/*
171 * Byte value map for control codes,
172 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
173 * to external byte values 0x00..0x20.
174 */
175static const int8_t
176bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
177/*  0     1     2     3     4     5     6     7    */
178    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
179
180/*  8     9     a     b     c     d     e     f    */
181    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
182
183/*  10    11    12    13   */
184    0x1c, 0x1d, 0x1e, 0x1f
185};
186
/**
 * Integer division and modulo with negative numerators
 * yield negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that an invocation followed by
 * a semicolon forms exactly one statement and can be used safely as the
 * unbraced body of an if/else.
 *
 * Note that n and m are evaluated several times;
 * pass only simple lvalues without side effects.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
208
209/* BOCU-1 implementation functions ------------------------------------------ */
210
/*
 * Default "prev" computation: clear the low 7 bits of c (start of its
 * aligned 128-block) and add BOCU1_ASCII_PREV to land in the block's middle.
 */
#define BOCU1_SIMPLE_PREV(c) (BOCU1_ASCII_PREV+((c)&~0x7f))
212
213/**
214 * Compute the next "previous" value for differencing
215 * from the current code point.
216 *
217 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
218 * @return "previous code point" state value
219 */
220static U_INLINE int32_t
221bocu1Prev(int32_t c) {
222    /* compute new prev */
223    if(/* 0x3040<=c && */ c<=0x309f) {
224        /* Hiragana is not 128-aligned */
225        return 0x3070;
226    } else if(0x4e00<=c && c<=0x9fa5) {
227        /* CJK Unihan */
228        return 0x4e00-BOCU1_REACH_NEG_2;
229    } else if(0xac00<=c /* && c<=0xd7a3 */) {
230        /* Korean Hangul */
231        return (0xd7a3+0xac00)/2;
232    } else {
233        /* mostly small scripts */
234        return BOCU1_SIMPLE_PREV(c);
235    }
236}
237
/**
 * Compute the next "prev" for any code point:
 * only 0x3040..0xd7a3 need the bocu1Prev() function,
 * all other code points use BOCU1_SIMPLE_PREV() directly.
 */
#define BOCU1_PREV(c) (((c)>=0x3040 && (c)<=0xd7a3) ? bocu1Prev(c) : BOCU1_SIMPLE_PREV(c))
240
241/*
242 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
243 * The UConverter fields are used as follows:
244 *
245 * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
246 *
247 * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
248 * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
249 */
250
251/* BOCU-1-from-Unicode conversion functions --------------------------------- */
252
253/**
254 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
255 * and return a packed integer with them.
256 *
257 * The encoding favors small absolut differences with short encodings
258 * to compress runs of same-script characters.
259 *
260 * Optimized version with unrolled loops and fewer floating-point operations
261 * than the standard packDiff().
262 *
263 * @param diff difference value -0x10ffff..0x10ffff
264 * @return
265 *      0x010000zz for 1-byte sequence zz
266 *      0x0200yyzz for 2-byte sequence yy zz
267 *      0x03xxyyzz for 3-byte sequence xx yy zz
268 *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
269 */
270static int32_t
271packDiff(int32_t diff) {
272    int32_t result, m;
273
274    if(diff>=BOCU1_REACH_NEG_1) {
275        /* mostly positive differences, and single-byte negative ones */
276#if 0   /* single-byte case handled in macros, see below */
277        if(diff<=BOCU1_REACH_POS_1) {
278            /* single byte */
279            return 0x01000000|(BOCU1_MIDDLE+diff);
280        } else
281#endif
282        if(diff<=BOCU1_REACH_POS_2) {
283            /* two bytes */
284            diff-=BOCU1_REACH_POS_1+1;
285            result=0x02000000;
286
287            m=diff%BOCU1_TRAIL_COUNT;
288            diff/=BOCU1_TRAIL_COUNT;
289            result|=BOCU1_TRAIL_TO_BYTE(m);
290
291            result|=(BOCU1_START_POS_2+diff)<<8;
292        } else if(diff<=BOCU1_REACH_POS_3) {
293            /* three bytes */
294            diff-=BOCU1_REACH_POS_2+1;
295            result=0x03000000;
296
297            m=diff%BOCU1_TRAIL_COUNT;
298            diff/=BOCU1_TRAIL_COUNT;
299            result|=BOCU1_TRAIL_TO_BYTE(m);
300
301            m=diff%BOCU1_TRAIL_COUNT;
302            diff/=BOCU1_TRAIL_COUNT;
303            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
304
305            result|=(BOCU1_START_POS_3+diff)<<16;
306        } else {
307            /* four bytes */
308            diff-=BOCU1_REACH_POS_3+1;
309
310            m=diff%BOCU1_TRAIL_COUNT;
311            diff/=BOCU1_TRAIL_COUNT;
312            result=BOCU1_TRAIL_TO_BYTE(m);
313
314            m=diff%BOCU1_TRAIL_COUNT;
315            diff/=BOCU1_TRAIL_COUNT;
316            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
317
318            /*
319             * We know that / and % would deliver quotient 0 and rest=diff.
320             * Avoid division and modulo for performance.
321             */
322            result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
323
324            result|=((uint32_t)BOCU1_START_POS_4)<<24;
325        }
326    } else {
327        /* two- to four-byte negative differences */
328        if(diff>=BOCU1_REACH_NEG_2) {
329            /* two bytes */
330            diff-=BOCU1_REACH_NEG_1;
331            result=0x02000000;
332
333            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
334            result|=BOCU1_TRAIL_TO_BYTE(m);
335
336            result|=(BOCU1_START_NEG_2+diff)<<8;
337        } else if(diff>=BOCU1_REACH_NEG_3) {
338            /* three bytes */
339            diff-=BOCU1_REACH_NEG_2;
340            result=0x03000000;
341
342            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
343            result|=BOCU1_TRAIL_TO_BYTE(m);
344
345            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
346            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
347
348            result|=(BOCU1_START_NEG_3+diff)<<16;
349        } else {
350            /* four bytes */
351            diff-=BOCU1_REACH_NEG_3;
352
353            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
354            result=BOCU1_TRAIL_TO_BYTE(m);
355
356            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
357            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
358
359            /*
360             * We know that NEGDIVMOD would deliver
361             * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
362             * Avoid division and modulo for performance.
363             */
364            m=diff+BOCU1_TRAIL_COUNT;
365            result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
366
367            result|=BOCU1_MIN<<24;
368        }
369    }
370    return result;
371}
372
/* Shortcuts that let callers bypass packDiff() for short encodings. */

/** TRUE if diff can be encoded in a single byte. */
#define DIFF_IS_SINGLE(diff) ((diff)>=BOCU1_REACH_NEG_1 && (diff)<=BOCU1_REACH_POS_1)

/** The byte that encodes diff; meaningful only if DIFF_IS_SINGLE(diff). */
#define PACK_SINGLE_DIFF(diff) ((diff)+BOCU1_MIDDLE)

/**
 * TRUE if diff lies within the reach of two bytes.
 * This range contains the single-byte range, so test DIFF_IS_SINGLE() first.
 */
#define DIFF_IS_DOUBLE(diff) ((diff)>=BOCU1_REACH_NEG_2 && (diff)<=BOCU1_REACH_POS_2)
383
384static void
385_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
386                             UErrorCode *pErrorCode) {
387    UConverter *cnv;
388    const UChar *source, *sourceLimit;
389    uint8_t *target;
390    int32_t targetCapacity;
391    int32_t *offsets;
392
393    int32_t prev, c, diff;
394
395    int32_t sourceIndex, nextSourceIndex;
396
397U_ALIGN_CODE(16)
398
399    /* set up the local pointers */
400    cnv=pArgs->converter;
401    source=pArgs->source;
402    sourceLimit=pArgs->sourceLimit;
403    target=(uint8_t *)pArgs->target;
404    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
405    offsets=pArgs->offsets;
406
407    /* get the converter state from UConverter */
408    c=cnv->fromUChar32;
409    prev=(int32_t)cnv->fromUnicodeStatus;
410    if(prev==0) {
411        prev=BOCU1_ASCII_PREV;
412    }
413
414    /* sourceIndex=-1 if the current character began in the previous buffer */
415    sourceIndex= c==0 ? 0 : -1;
416    nextSourceIndex=0;
417
418    /* conversion loop */
419    if(c!=0 && targetCapacity>0) {
420        goto getTrail;
421    }
422
423fastSingle:
424    /* fast loop for single-byte differences */
425    /* use only one loop counter variable, targetCapacity, not also source */
426    diff=(int32_t)(sourceLimit-source);
427    if(targetCapacity>diff) {
428        targetCapacity=diff;
429    }
430    while(targetCapacity>0 && (c=*source)<0x3000) {
431        if(c<=0x20) {
432            if(c!=0x20) {
433                prev=BOCU1_ASCII_PREV;
434            }
435            *target++=(uint8_t)c;
436            *offsets++=nextSourceIndex++;
437            ++source;
438            --targetCapacity;
439        } else {
440            diff=c-prev;
441            if(DIFF_IS_SINGLE(diff)) {
442                prev=BOCU1_SIMPLE_PREV(c);
443                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
444                *offsets++=nextSourceIndex++;
445                ++source;
446                --targetCapacity;
447            } else {
448                break;
449            }
450        }
451    }
452    /* restore real values */
453    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
454    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
455
456    /* regular loop for all cases */
457    while(source<sourceLimit) {
458        if(targetCapacity>0) {
459            c=*source++;
460            ++nextSourceIndex;
461
462            if(c<=0x20) {
463                /*
464                 * ISO C0 control & space:
465                 * Encode directly for MIME compatibility,
466                 * and reset state except for space, to not disrupt compression.
467                 */
468                if(c!=0x20) {
469                    prev=BOCU1_ASCII_PREV;
470                }
471                *target++=(uint8_t)c;
472                *offsets++=sourceIndex;
473                --targetCapacity;
474
475                sourceIndex=nextSourceIndex;
476                continue;
477            }
478
479            if(UTF_IS_LEAD(c)) {
480getTrail:
481                if(source<sourceLimit) {
482                    /* test the following code unit */
483                    UChar trail=*source;
484                    if(UTF_IS_SECOND_SURROGATE(trail)) {
485                        ++source;
486                        ++nextSourceIndex;
487                        c=UTF16_GET_PAIR_VALUE(c, trail);
488                    }
489                } else {
490                    /* no more input */
491                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
492                    break;
493                }
494            }
495
496            /*
497             * all other Unicode code points c==U+0021..U+10ffff
498             * are encoded with the difference c-prev
499             *
500             * a new prev is computed from c,
501             * placed in the middle of a 0x80-block (for most small scripts) or
502             * in the middle of the Unihan and Hangul blocks
503             * to statistically minimize the following difference
504             */
505            diff=c-prev;
506            prev=BOCU1_PREV(c);
507            if(DIFF_IS_SINGLE(diff)) {
508                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
509                *offsets++=sourceIndex;
510                --targetCapacity;
511                sourceIndex=nextSourceIndex;
512                if(c<0x3000) {
513                    goto fastSingle;
514                }
515            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
516                /* optimize 2-byte case */
517                int32_t m;
518
519                if(diff>=0) {
520                    diff-=BOCU1_REACH_POS_1+1;
521                    m=diff%BOCU1_TRAIL_COUNT;
522                    diff/=BOCU1_TRAIL_COUNT;
523                    diff+=BOCU1_START_POS_2;
524                } else {
525                    diff-=BOCU1_REACH_NEG_1;
526                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
527                    diff+=BOCU1_START_NEG_2;
528                }
529                *target++=(uint8_t)diff;
530                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
531                *offsets++=sourceIndex;
532                *offsets++=sourceIndex;
533                targetCapacity-=2;
534                sourceIndex=nextSourceIndex;
535            } else {
536                int32_t length; /* will be 2..4 */
537
538                diff=packDiff(diff);
539                length=BOCU1_LENGTH_FROM_PACKED(diff);
540
541                /* write the output character bytes from diff and length */
542                /* from the first if in the loop we know that targetCapacity>0 */
543                if(length<=targetCapacity) {
544                    switch(length) {
545                        /* each branch falls through to the next one */
546                    case 4:
547                        *target++=(uint8_t)(diff>>24);
548                        *offsets++=sourceIndex;
549                    case 3:
550                        *target++=(uint8_t)(diff>>16);
551                        *offsets++=sourceIndex;
552                    case 2:
553                        *target++=(uint8_t)(diff>>8);
554                        *offsets++=sourceIndex;
555                    /* case 1: handled above */
556                        *target++=(uint8_t)diff;
557                        *offsets++=sourceIndex;
558                    default:
559                        /* will never occur */
560                        break;
561                    }
562                    targetCapacity-=length;
563                    sourceIndex=nextSourceIndex;
564                } else {
565                    uint8_t *charErrorBuffer;
566
567                    /*
568                     * We actually do this backwards here:
569                     * In order to save an intermediate variable, we output
570                     * first to the overflow buffer what does not fit into the
571                     * regular target.
572                     */
573                    /* we know that 1<=targetCapacity<length<=4 */
574                    length-=targetCapacity;
575                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
576                    switch(length) {
577                        /* each branch falls through to the next one */
578                    case 3:
579                        *charErrorBuffer++=(uint8_t)(diff>>16);
580                    case 2:
581                        *charErrorBuffer++=(uint8_t)(diff>>8);
582                    case 1:
583                        *charErrorBuffer=(uint8_t)diff;
584                    default:
585                        /* will never occur */
586                        break;
587                    }
588                    cnv->charErrorBufferLength=(int8_t)length;
589
590                    /* now output what fits into the regular target */
591                    diff>>=8*length; /* length was reduced by targetCapacity */
592                    switch(targetCapacity) {
593                        /* each branch falls through to the next one */
594                    case 3:
595                        *target++=(uint8_t)(diff>>16);
596                        *offsets++=sourceIndex;
597                    case 2:
598                        *target++=(uint8_t)(diff>>8);
599                        *offsets++=sourceIndex;
600                    case 1:
601                        *target++=(uint8_t)diff;
602                        *offsets++=sourceIndex;
603                    default:
604                        /* will never occur */
605                        break;
606                    }
607
608                    /* target overflow */
609                    targetCapacity=0;
610                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
611                    break;
612                }
613            }
614        } else {
615            /* target is full */
616            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
617            break;
618        }
619    }
620
621    /* set the converter state back into UConverter */
622    cnv->fromUChar32= c<0 ? -c : 0;
623    cnv->fromUnicodeStatus=(uint32_t)prev;
624
625    /* write back the updated pointers */
626    pArgs->source=source;
627    pArgs->target=(char *)target;
628    pArgs->offsets=offsets;
629}
630
631/*
632 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
633 * If a change is made in the original function, then either
634 * change this function the same way or
635 * re-copy the original function and remove the variables
636 * offsets, sourceIndex, and nextSourceIndex.
637 */
638static void
639_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
640                  UErrorCode *pErrorCode) {
641    UConverter *cnv;
642    const UChar *source, *sourceLimit;
643    uint8_t *target;
644    int32_t targetCapacity;
645
646    int32_t prev, c, diff;
647
648    /* set up the local pointers */
649    cnv=pArgs->converter;
650    source=pArgs->source;
651    sourceLimit=pArgs->sourceLimit;
652    target=(uint8_t *)pArgs->target;
653    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
654
655    /* get the converter state from UConverter */
656    c=cnv->fromUChar32;
657    prev=(int32_t)cnv->fromUnicodeStatus;
658    if(prev==0) {
659        prev=BOCU1_ASCII_PREV;
660    }
661
662    /* conversion loop */
663    if(c!=0 && targetCapacity>0) {
664        goto getTrail;
665    }
666
667fastSingle:
668    /* fast loop for single-byte differences */
669    /* use only one loop counter variable, targetCapacity, not also source */
670    diff=(int32_t)(sourceLimit-source);
671    if(targetCapacity>diff) {
672        targetCapacity=diff;
673    }
674    while(targetCapacity>0 && (c=*source)<0x3000) {
675        if(c<=0x20) {
676            if(c!=0x20) {
677                prev=BOCU1_ASCII_PREV;
678            }
679            *target++=(uint8_t)c;
680        } else {
681            diff=c-prev;
682            if(DIFF_IS_SINGLE(diff)) {
683                prev=BOCU1_SIMPLE_PREV(c);
684                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
685            } else {
686                break;
687            }
688        }
689        ++source;
690        --targetCapacity;
691    }
692    /* restore real values */
693    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
694
695    /* regular loop for all cases */
696    while(source<sourceLimit) {
697        if(targetCapacity>0) {
698            c=*source++;
699
700            if(c<=0x20) {
701                /*
702                 * ISO C0 control & space:
703                 * Encode directly for MIME compatibility,
704                 * and reset state except for space, to not disrupt compression.
705                 */
706                if(c!=0x20) {
707                    prev=BOCU1_ASCII_PREV;
708                }
709                *target++=(uint8_t)c;
710                --targetCapacity;
711                continue;
712            }
713
714            if(UTF_IS_LEAD(c)) {
715getTrail:
716                if(source<sourceLimit) {
717                    /* test the following code unit */
718                    UChar trail=*source;
719                    if(UTF_IS_SECOND_SURROGATE(trail)) {
720                        ++source;
721                        c=UTF16_GET_PAIR_VALUE(c, trail);
722                    }
723                } else {
724                    /* no more input */
725                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
726                    break;
727                }
728            }
729
730            /*
731             * all other Unicode code points c==U+0021..U+10ffff
732             * are encoded with the difference c-prev
733             *
734             * a new prev is computed from c,
735             * placed in the middle of a 0x80-block (for most small scripts) or
736             * in the middle of the Unihan and Hangul blocks
737             * to statistically minimize the following difference
738             */
739            diff=c-prev;
740            prev=BOCU1_PREV(c);
741            if(DIFF_IS_SINGLE(diff)) {
742                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
743                --targetCapacity;
744                if(c<0x3000) {
745                    goto fastSingle;
746                }
747            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
748                /* optimize 2-byte case */
749                int32_t m;
750
751                if(diff>=0) {
752                    diff-=BOCU1_REACH_POS_1+1;
753                    m=diff%BOCU1_TRAIL_COUNT;
754                    diff/=BOCU1_TRAIL_COUNT;
755                    diff+=BOCU1_START_POS_2;
756                } else {
757                    diff-=BOCU1_REACH_NEG_1;
758                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
759                    diff+=BOCU1_START_NEG_2;
760                }
761                *target++=(uint8_t)diff;
762                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
763                targetCapacity-=2;
764            } else {
765                int32_t length; /* will be 2..4 */
766
767                diff=packDiff(diff);
768                length=BOCU1_LENGTH_FROM_PACKED(diff);
769
770                /* write the output character bytes from diff and length */
771                /* from the first if in the loop we know that targetCapacity>0 */
772                if(length<=targetCapacity) {
773                    switch(length) {
774                        /* each branch falls through to the next one */
775                    case 4:
776                        *target++=(uint8_t)(diff>>24);
777                    case 3:
778                        *target++=(uint8_t)(diff>>16);
779                    /* case 2: handled above */
780                        *target++=(uint8_t)(diff>>8);
781                    /* case 1: handled above */
782                        *target++=(uint8_t)diff;
783                    default:
784                        /* will never occur */
785                        break;
786                    }
787                    targetCapacity-=length;
788                } else {
789                    uint8_t *charErrorBuffer;
790
791                    /*
792                     * We actually do this backwards here:
793                     * In order to save an intermediate variable, we output
794                     * first to the overflow buffer what does not fit into the
795                     * regular target.
796                     */
797                    /* we know that 1<=targetCapacity<length<=4 */
798                    length-=targetCapacity;
799                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
800                    switch(length) {
801                        /* each branch falls through to the next one */
802                    case 3:
803                        *charErrorBuffer++=(uint8_t)(diff>>16);
804                    case 2:
805                        *charErrorBuffer++=(uint8_t)(diff>>8);
806                    case 1:
807                        *charErrorBuffer=(uint8_t)diff;
808                    default:
809                        /* will never occur */
810                        break;
811                    }
812                    cnv->charErrorBufferLength=(int8_t)length;
813
814                    /* now output what fits into the regular target */
815                    diff>>=8*length; /* length was reduced by targetCapacity */
816                    switch(targetCapacity) {
817                        /* each branch falls through to the next one */
818                    case 3:
819                        *target++=(uint8_t)(diff>>16);
820                    case 2:
821                        *target++=(uint8_t)(diff>>8);
822                    case 1:
823                        *target++=(uint8_t)diff;
824                    default:
825                        /* will never occur */
826                        break;
827                    }
828
829                    /* target overflow */
830                    targetCapacity=0;
831                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
832                    break;
833                }
834            }
835        } else {
836            /* target is full */
837            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
838            break;
839        }
840    }
841
842    /* set the converter state back into UConverter */
843    cnv->fromUChar32= c<0 ? -c : 0;
844    cnv->fromUnicodeStatus=(uint32_t)prev;
845
846    /* write back the updated pointers */
847    pArgs->source=source;
848    pArgs->target=(char *)target;
849}
850
851/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
852
853/**
854 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
855 *
856 * @param b lead byte;
857 *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
858 * @return (diff<<2)|count
859 */
860static U_INLINE int32_t
861decodeBocu1LeadByte(int32_t b) {
862    int32_t diff, count;
863
864    if(b>=BOCU1_START_NEG_2) {
865        /* positive difference */
866        if(b<BOCU1_START_POS_3) {
867            /* two bytes */
868            diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
869            count=1;
870        } else if(b<BOCU1_START_POS_4) {
871            /* three bytes */
872            diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
873            count=2;
874        } else {
875            /* four bytes */
876            diff=BOCU1_REACH_POS_3+1;
877            count=3;
878        }
879    } else {
880        /* negative difference */
881        if(b>=BOCU1_START_NEG_3) {
882            /* two bytes */
883            diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
884            count=1;
885        } else if(b>BOCU1_MIN) {
886            /* three bytes */
887            diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
888            count=2;
889        } else {
890            /* four bytes */
891            diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
892            count=3;
893        }
894    }
895
896    /* return the state for decoding the trail byte(s) */
897    return (diff<<2)|count;
898}
899
900/**
901 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
902 *
903 * @param count number of remaining trail bytes including this one
904 * @param b trail byte
905 * @return new delta for diff including b - <0 indicates an error
906 *
907 * @see decodeBocu1
908 */
909static U_INLINE int32_t
910decodeBocu1TrailByte(int32_t count, int32_t b) {
911    if(b<=0x20) {
912        /* skip some C0 controls and make the trail byte range contiguous */
913        b=bocu1ByteToTrail[b];
914        /* b<0 for an illegal trail byte value will result in return<0 below */
915#if BOCU1_MAX_TRAIL<0xff
916    } else if(b>BOCU1_MAX_TRAIL) {
917        return -99;
918#endif
919    } else {
920        b-=BOCU1_TRAIL_BYTE_OFFSET;
921    }
922
923    /* add trail byte into difference and decrement count */
924    if(count==1) {
925        return b;
926    } else if(count==2) {
927        return b*BOCU1_TRAIL_COUNT;
928    } else /* count==3 */ {
929        return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
930    }
931}
932
933static void
934_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
935                           UErrorCode *pErrorCode) {
936    UConverter *cnv;
937    const uint8_t *source, *sourceLimit;
938    UChar *target;
939    const UChar *targetLimit;
940    int32_t *offsets;
941
942    int32_t prev, count, diff, c;
943
944    int8_t byteIndex;
945    uint8_t *bytes;
946
947    int32_t sourceIndex, nextSourceIndex;
948
949    /* set up the local pointers */
950    cnv=pArgs->converter;
951    source=(const uint8_t *)pArgs->source;
952    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
953    target=pArgs->target;
954    targetLimit=pArgs->targetLimit;
955    offsets=pArgs->offsets;
956
957    /* get the converter state from UConverter */
958    prev=(int32_t)cnv->toUnicodeStatus;
959    if(prev==0) {
960        prev=BOCU1_ASCII_PREV;
961    }
962    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
963    count=diff&3;
964    diff>>=2;
965
966    byteIndex=cnv->toULength;
967    bytes=cnv->toUBytes;
968
969    /* sourceIndex=-1 if the current character began in the previous buffer */
970    sourceIndex=byteIndex==0 ? 0 : -1;
971    nextSourceIndex=0;
972
973    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
974    if(count>0 && byteIndex>0 && target<targetLimit) {
975        goto getTrail;
976    }
977
978fastSingle:
979    /* fast loop for single-byte differences */
980    /* use count as the only loop counter variable */
981    diff=(int32_t)(sourceLimit-source);
982    count=(int32_t)(pArgs->targetLimit-target);
983    if(count>diff) {
984        count=diff;
985    }
986    while(count>0) {
987        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
988            c=prev+(c-BOCU1_MIDDLE);
989            if(c<0x3000) {
990                *target++=(UChar)c;
991                *offsets++=nextSourceIndex++;
992                prev=BOCU1_SIMPLE_PREV(c);
993            } else {
994                break;
995            }
996        } else if(c<=0x20) {
997            if(c!=0x20) {
998                prev=BOCU1_ASCII_PREV;
999            }
1000            *target++=(UChar)c;
1001            *offsets++=nextSourceIndex++;
1002        } else {
1003            break;
1004        }
1005        ++source;
1006        --count;
1007    }
1008    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1009
1010    /* decode a sequence of single and lead bytes */
1011    while(source<sourceLimit) {
1012        if(target>=targetLimit) {
1013            /* target is full */
1014            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1015            break;
1016        }
1017
1018        ++nextSourceIndex;
1019        c=*source++;
1020        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1021            /* Write a code point directly from a single-byte difference. */
1022            c=prev+(c-BOCU1_MIDDLE);
1023            if(c<0x3000) {
1024                *target++=(UChar)c;
1025                *offsets++=sourceIndex;
1026                prev=BOCU1_SIMPLE_PREV(c);
1027                sourceIndex=nextSourceIndex;
1028                goto fastSingle;
1029            }
1030        } else if(c<=0x20) {
1031            /*
1032             * Direct-encoded C0 control code or space.
1033             * Reset prev for C0 control codes but not for space.
1034             */
1035            if(c!=0x20) {
1036                prev=BOCU1_ASCII_PREV;
1037            }
1038            *target++=(UChar)c;
1039            *offsets++=sourceIndex;
1040            sourceIndex=nextSourceIndex;
1041            continue;
1042        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1043            /* Optimize two-byte case. */
1044            if(c>=BOCU1_MIDDLE) {
1045                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1046            } else {
1047                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1048            }
1049
1050            /* trail byte */
1051            ++nextSourceIndex;
1052            c=decodeBocu1TrailByte(1, *source++);
1053            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1054                bytes[0]=source[-2];
1055                bytes[1]=source[-1];
1056                byteIndex=2;
1057                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1058                break;
1059            }
1060        } else if(c==BOCU1_RESET) {
1061            /* only reset the state, no code point */
1062            prev=BOCU1_ASCII_PREV;
1063            sourceIndex=nextSourceIndex;
1064            continue;
1065        } else {
1066            /*
1067             * For multi-byte difference lead bytes, set the decoder state
1068             * with the partial difference value from the lead byte and
1069             * with the number of trail bytes.
1070             */
1071            bytes[0]=(uint8_t)c;
1072            byteIndex=1;
1073
1074            diff=decodeBocu1LeadByte(c);
1075            count=diff&3;
1076            diff>>=2;
1077getTrail:
1078            for(;;) {
1079                if(source>=sourceLimit) {
1080                    goto endloop;
1081                }
1082                ++nextSourceIndex;
1083                c=bytes[byteIndex++]=*source++;
1084
1085                /* trail byte in any position */
1086                c=decodeBocu1TrailByte(count, c);
1087                if(c<0) {
1088                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1089                    goto endloop;
1090                }
1091
1092                diff+=c;
1093                if(--count==0) {
1094                    /* final trail byte, deliver a code point */
1095                    byteIndex=0;
1096                    c=prev+diff;
1097                    if((uint32_t)c>0x10ffff) {
1098                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1099                        goto endloop;
1100                    }
1101                    break;
1102                }
1103            }
1104        }
1105
1106        /* calculate the next prev and output c */
1107        prev=BOCU1_PREV(c);
1108        if(c<=0xffff) {
1109            *target++=(UChar)c;
1110            *offsets++=sourceIndex;
1111        } else {
1112            /* output surrogate pair */
1113            *target++=UTF16_LEAD(c);
1114            if(target<targetLimit) {
1115                *target++=UTF16_TRAIL(c);
1116                *offsets++=sourceIndex;
1117                *offsets++=sourceIndex;
1118            } else {
1119                /* target overflow */
1120                *offsets++=sourceIndex;
1121                cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
1122                cnv->UCharErrorBufferLength=1;
1123                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1124                break;
1125            }
1126        }
1127        sourceIndex=nextSourceIndex;
1128    }
1129endloop:
1130
1131    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1132        /* set the converter state in UConverter to deal with the next character */
1133        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1134        cnv->mode=0;
1135    } else {
1136        /* set the converter state back into UConverter */
1137        cnv->toUnicodeStatus=(uint32_t)prev;
1138        cnv->mode=(diff<<2)|count;
1139    }
1140    cnv->toULength=byteIndex;
1141
1142    /* write back the updated pointers */
1143    pArgs->source=(const char *)source;
1144    pArgs->target=target;
1145    pArgs->offsets=offsets;
1146    return;
1147}
1148
1149/*
1150 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1151 * If a change is made in the original function, then either
1152 * change this function the same way or
1153 * re-copy the original function and remove the variables
1154 * offsets, sourceIndex, and nextSourceIndex.
1155 */
1156static void
1157_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1158                UErrorCode *pErrorCode) {
1159    UConverter *cnv;
1160    const uint8_t *source, *sourceLimit;
1161    UChar *target;
1162    const UChar *targetLimit;
1163
1164    int32_t prev, count, diff, c;
1165
1166    int8_t byteIndex;
1167    uint8_t *bytes;
1168
1169U_ALIGN_CODE(16)
1170
1171    /* set up the local pointers */
1172    cnv=pArgs->converter;
1173    source=(const uint8_t *)pArgs->source;
1174    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1175    target=pArgs->target;
1176    targetLimit=pArgs->targetLimit;
1177
1178    /* get the converter state from UConverter */
1179    prev=(int32_t)cnv->toUnicodeStatus;
1180    if(prev==0) {
1181        prev=BOCU1_ASCII_PREV;
1182    }
1183    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1184    count=diff&3;
1185    diff>>=2;
1186
1187    byteIndex=cnv->toULength;
1188    bytes=cnv->toUBytes;
1189
1190    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1191    if(count>0 && byteIndex>0 && target<targetLimit) {
1192        goto getTrail;
1193    }
1194
1195fastSingle:
1196    /* fast loop for single-byte differences */
1197    /* use count as the only loop counter variable */
1198    diff=(int32_t)(sourceLimit-source);
1199    count=(int32_t)(pArgs->targetLimit-target);
1200    if(count>diff) {
1201        count=diff;
1202    }
1203    while(count>0) {
1204        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1205            c=prev+(c-BOCU1_MIDDLE);
1206            if(c<0x3000) {
1207                *target++=(UChar)c;
1208                prev=BOCU1_SIMPLE_PREV(c);
1209            } else {
1210                break;
1211            }
1212        } else if(c<=0x20) {
1213            if(c!=0x20) {
1214                prev=BOCU1_ASCII_PREV;
1215            }
1216            *target++=(UChar)c;
1217        } else {
1218            break;
1219        }
1220        ++source;
1221        --count;
1222    }
1223
1224    /* decode a sequence of single and lead bytes */
1225    while(source<sourceLimit) {
1226        if(target>=targetLimit) {
1227            /* target is full */
1228            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1229            break;
1230        }
1231
1232        c=*source++;
1233        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1234            /* Write a code point directly from a single-byte difference. */
1235            c=prev+(c-BOCU1_MIDDLE);
1236            if(c<0x3000) {
1237                *target++=(UChar)c;
1238                prev=BOCU1_SIMPLE_PREV(c);
1239                goto fastSingle;
1240            }
1241        } else if(c<=0x20) {
1242            /*
1243             * Direct-encoded C0 control code or space.
1244             * Reset prev for C0 control codes but not for space.
1245             */
1246            if(c!=0x20) {
1247                prev=BOCU1_ASCII_PREV;
1248            }
1249            *target++=(UChar)c;
1250            continue;
1251        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1252            /* Optimize two-byte case. */
1253            if(c>=BOCU1_MIDDLE) {
1254                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1255            } else {
1256                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1257            }
1258
1259            /* trail byte */
1260            c=decodeBocu1TrailByte(1, *source++);
1261            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1262                bytes[0]=source[-2];
1263                bytes[1]=source[-1];
1264                byteIndex=2;
1265                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1266                break;
1267            }
1268        } else if(c==BOCU1_RESET) {
1269            /* only reset the state, no code point */
1270            prev=BOCU1_ASCII_PREV;
1271            continue;
1272        } else {
1273            /*
1274             * For multi-byte difference lead bytes, set the decoder state
1275             * with the partial difference value from the lead byte and
1276             * with the number of trail bytes.
1277             */
1278            bytes[0]=(uint8_t)c;
1279            byteIndex=1;
1280
1281            diff=decodeBocu1LeadByte(c);
1282            count=diff&3;
1283            diff>>=2;
1284getTrail:
1285            for(;;) {
1286                if(source>=sourceLimit) {
1287                    goto endloop;
1288                }
1289                c=bytes[byteIndex++]=*source++;
1290
1291                /* trail byte in any position */
1292                c=decodeBocu1TrailByte(count, c);
1293                if(c<0) {
1294                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1295                    goto endloop;
1296                }
1297
1298                diff+=c;
1299                if(--count==0) {
1300                    /* final trail byte, deliver a code point */
1301                    byteIndex=0;
1302                    c=prev+diff;
1303                    if((uint32_t)c>0x10ffff) {
1304                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1305                        goto endloop;
1306                    }
1307                    break;
1308                }
1309            }
1310        }
1311
1312        /* calculate the next prev and output c */
1313        prev=BOCU1_PREV(c);
1314        if(c<=0xffff) {
1315            *target++=(UChar)c;
1316        } else {
1317            /* output surrogate pair */
1318            *target++=UTF16_LEAD(c);
1319            if(target<targetLimit) {
1320                *target++=UTF16_TRAIL(c);
1321            } else {
1322                /* target overflow */
1323                cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
1324                cnv->UCharErrorBufferLength=1;
1325                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1326                break;
1327            }
1328        }
1329    }
1330endloop:
1331
1332    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1333        /* set the converter state in UConverter to deal with the next character */
1334        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1335        cnv->mode=0;
1336    } else {
1337        /* set the converter state back into UConverter */
1338        cnv->toUnicodeStatus=(uint32_t)prev;
1339        cnv->mode=(diff<<2)|count;
1340    }
1341    cnv->toULength=byteIndex;
1342
1343    /* write back the updated pointers */
1344    pArgs->source=(const char *)source;
1345    pArgs->target=target;
1346    return;
1347}
1348
1349/* miscellaneous ------------------------------------------------------------ */
1350
/*
 * Implementation table for the BOCU-1 converter.
 * Besides the converter type, only the four conversion functions
 * (to/from Unicode, each with and without offsets) and
 * ucnv_getCompleteUnicodeSet are supplied; every other slot is NULL.
 * NOTE(review): the meaning of each positional slot is defined by the
 * UConverterImpl declaration, which is not visible here - confirm against
 * it before inserting or reordering entries.
 */
static const UConverterImpl _Bocu1Impl={
    UCNV_BOCU1,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* conversion functions: to Unicode first, then from Unicode */
    _Bocu1ToUnicode,
    _Bocu1ToUnicodeWithOffsets,
    _Bocu1FromUnicode,
    _Bocu1FromUnicodeWithOffsets,
    NULL,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getCompleteUnicodeSet
};
1373
/*
 * Static description of the BOCU-1 converter: structure size, name,
 * CCSID, platform and converter type, byte-length bounds and
 * substitution character.
 * NOTE(review): fields are initialized positionally; the fields without an
 * inline comment below are defined by the UConverterStaticData declaration,
 * which is not visible here - confirm against it.
 */
static const UConverterStaticData _Bocu1StaticData={
    sizeof(UConverterStaticData),
    "BOCU-1",
    1214, /* CCSID for BOCU-1 */
    UCNV_IBM, UCNV_BOCU1,
    1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
    { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1386
/*
 * Shared data object for the BOCU-1 converter; it ties together the
 * static data and the implementation table defined above.
 * Unlike those two, it is not declared static, so it has external linkage.
 * NOTE(review): the remaining positional fields are defined by the
 * UConverterSharedData declaration, which is not visible here - confirm
 * against it.
 */
const UConverterSharedData _Bocu1Data={
    sizeof(UConverterSharedData), ~((uint32_t)0),
    NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
    0
};
1392
1393#endif
1394