1/*
2******************************************************************************
3*
4*   Copyright (C) 2002-2011, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  bocu1tst.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2002may27
14*   created by: Markus W. Scherer
15*
16*   This is the reference implementation of BOCU-1,
17*   the MIME-friendly form of the Binary Ordered Compression for Unicode,
18*   taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
19*   The files bocu1.h and bocu1.c from the design folder are taken
20*   verbatim (minus copyright and #include) and copied together into this file.
21*   The reference code and some of the reference bocu1tst.c
*   are modified to run as part of the ICU cintltst
23*   test framework (minus main(), log_ln() etc. instead of printf()).
24*
25*   This reference implementation is used here to verify
26*   the ICU BOCU-1 implementation, which is
27*   adapted for ICU conversion APIs and optimized.
28*   ### links in design doc to here and to ucnvbocu.c
29*/
30
31#include "unicode/utypes.h"
32#include "unicode/ustring.h"
33#include "unicode/ucnv.h"
34#include "unicode/utf16.h"
35#include "cmemory.h"
36#include "cintltst.h"
37
/* Element count of a true array (not a pointer): total size over the size of one element. */
#define LENGTHOF(array) (sizeof(array)/sizeof(*(array)))
39
40/* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
41
42/* BOCU-1 constants and macros ---------------------------------------------- */
43
44/*
45 * BOCU-1 encodes the code points of a Unicode string as
46 * a sequence of byte-encoded differences (slope detection),
47 * preserving lexical order.
48 *
49 * Optimize the difference-taking for runs of Unicode text within
50 * small scripts:
51 *
52 * Most small scripts are allocated within aligned 128-blocks of Unicode
53 * code points. Lexical order is preserved if the "previous code point" state
54 * is always moved into the middle of such a block.
55 *
56 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
57 * areas into the middle of those areas.
58 *
59 * C0 control codes and space are encoded with their US-ASCII bytes.
60 * "prev" is reset for C0 controls but not for space.
61 */
62
/* Starting value of the "prev" state: the midpoint of the ASCII block. */
#define BOCU1_ASCII_PREV            0x40

/* Limits of the byte values that take part in difference encoding. */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe

/* The L suffix keeps computations with BOCU1_MAX_TRAIL working on 16-bit compilers. */
#define BOCU1_MAX_TRAIL             0xffL
#define BOCU1_RESET                 0xff

/* How many distinct lead byte values there are. */
#define BOCU1_COUNT                 (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* Some C0 control byte values double as trail bytes; account for them. */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many distinct trail byte values there are. */
#define BOCU1_TRAIL_COUNT           ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * How many single-byte codes exist on each side of the middle
 * (the zero difference, BOCU1_MIDDLE, counts as a positive one).
 */
#define BOCU1_SINGLE                64

/* Lead bytes available per sign for 2-, 3- and 4-byte sequences. */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* Largest and smallest difference that fits into a single byte. */
#define BOCU1_REACH_POS_1           (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1           (-BOCU1_SINGLE)

/* Largest and smallest difference that fits into two bytes. */
#define BOCU1_REACH_POS_2           (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2           (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest and smallest difference that fits into three bytes. */
#define BOCU1_REACH_POS_3           (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3           (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte range; the 4-byte one equals BOCU1_MAX_LEAD. */
#define BOCU1_START_POS_2           (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3           (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4           (BOCU1_START_POS_3+BOCU1_LEAD_3)

/* Lower lead byte bound of each negative range; the 4-byte one equals BOCU1_MIN+1. */
#define BOCU1_START_NEG_2           (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3           (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4           (BOCU1_START_NEG_3-BOCU1_LEAD_3)

/*
 * Sequence length implied by a lead byte (which must not be BOCU1_RESET).
 * The lead byte ranges are nested, so test from the outermost range inwards.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    (((lead)<BOCU1_START_NEG_4 || BOCU1_START_POS_4<=(lead)) ? 4 : \
     ((lead)<BOCU1_START_NEG_3 || BOCU1_START_POS_3<=(lead)) ? 3 : \
     ((lead)<BOCU1_START_NEG_2 || BOCU1_START_POS_2<=(lead)) ? 2 : 1)

/*
 * Sequence length stored in a packed value:
 * the top byte holds the length 1..3, anything larger is a 4-byte lead byte.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)>=0x04000000 ? 4 : (packed)>>24)
130
131/*
132 * 12 commonly used C0 control codes (and space) are only used to encode
133 * themselves directly,
134 * which makes BOCU-1 MIME-usable and reasonably safe for
135 * ASCII-oriented software.
136 *
137 * These controls are
138 *  0   NUL
139 *
140 *  7   BEL
141 *  8   BS
142 *
143 *  9   TAB
144 *  a   LF
145 *  b   VT
146 *  c   FF
147 *  d   CR
148 *
149 *  e   SO
150 *  f   SI
151 *
152 * 1a   SUB
153 * 1b   ESC
154 *
155 * The other 20 C0 controls are also encoded directly (to preserve order)
156 * but are also used as trail bytes in difference encoding
157 * (for better compression).
158 */
/* Map a trail value to its byte: the lowest values go through the C0 control table, the rest are offset. */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
160
161/*
162 * Byte value map for control codes,
163 * from external byte values 0x00..0x20
164 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
165 * External byte values that are illegal as trail bytes are mapped to -1.
166 */
167static const int8_t
168bocu1ByteToTrail[BOCU1_MIN]={
169/*  0     1     2     3     4     5     6     7    */
170    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
171
172/*  8     9     a     b     c     d     e     f    */
173    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
174
175/*  10    11    12    13    14    15    16    17   */
176    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
177
178/*  18    19    1a    1b    1c    1d    1e    1f   */
179    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
180
181/*  20   */
182    -1
183};
184
185/*
186 * Byte value map for control codes,
187 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
188 * to external byte values 0x00..0x20.
189 */
190static const int8_t
191bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
192/*  0     1     2     3     4     5     6     7    */
193    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
194
195/*  8     9     a     b     c     d     e     f    */
196    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
197
198/*  10    11    12    13   */
199    0x1c, 0x1d, 0x1e, 0x1f
200};
201
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that "NEGDIVMOD(...);" is a
 * single statement and can be used safely in an unbraced if/else.
 * The arguments are evaluated more than once; pass plain variables.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
223
/* Decoder state that decodeBocu1() carries from one input byte to the next. */
typedef struct Bocu1Rx {
    int32_t prev;   /* "previous code point" state that differences are applied to */
    int32_t count;  /* trail bytes still expected; 0 means the next byte is in lead position */
    int32_t diff;   /* partial difference accumulated from the bytes seen so far */
} Bocu1Rx;
230
231/* Function prototypes ------------------------------------------------------ */
232
233/* see bocu1.c */
234U_CFUNC int32_t
235packDiff(int32_t diff);
236
237U_CFUNC int32_t
238encodeBocu1(int32_t *pPrev, int32_t c);
239
240U_CFUNC int32_t
241decodeBocu1(Bocu1Rx *pRx, uint8_t b);
242
243/* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
244
245/* BOCU-1 implementation functions ------------------------------------------ */
246
247/**
248 * Compute the next "previous" value for differencing
249 * from the current code point.
250 *
251 * @param c current code point, 0..0x10ffff
252 * @return "previous code point" state value
253 */
254static int32_t
255bocu1Prev(int32_t c) {
256    /* compute new prev */
257    if(0x3040<=c && c<=0x309f) {
258        /* Hiragana is not 128-aligned */
259        return 0x3070;
260    } else if(0x4e00<=c && c<=0x9fa5) {
261        /* CJK Unihan */
262        return 0x4e00-BOCU1_REACH_NEG_2;
263    } else if(0xac00<=c && c<=0xd7a3) {
264        /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
265        return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
266    } else {
267        /* mostly small scripts */
268        return (c&~0x7f)+BOCU1_ASCII_PREV;
269    }
270}
271
272/**
273 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
274 * and return a packed integer with them.
275 *
276 * The encoding favors small absolut differences with short encodings
277 * to compress runs of same-script characters.
278 *
279 * @param diff difference value -0x10ffff..0x10ffff
280 * @return
281 *      0x010000zz for 1-byte sequence zz
282 *      0x0200yyzz for 2-byte sequence yy zz
283 *      0x03xxyyzz for 3-byte sequence xx yy zz
284 *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
285 */
286U_CFUNC int32_t
287packDiff(int32_t diff) {
288    int32_t result, m, lead, count, shift;
289
290    if(diff>=BOCU1_REACH_NEG_1) {
291        /* mostly positive differences, and single-byte negative ones */
292        if(diff<=BOCU1_REACH_POS_1) {
293            /* single byte */
294            return 0x01000000|(BOCU1_MIDDLE+diff);
295        } else if(diff<=BOCU1_REACH_POS_2) {
296            /* two bytes */
297            diff-=BOCU1_REACH_POS_1+1;
298            lead=BOCU1_START_POS_2;
299            count=1;
300        } else if(diff<=BOCU1_REACH_POS_3) {
301            /* three bytes */
302            diff-=BOCU1_REACH_POS_2+1;
303            lead=BOCU1_START_POS_3;
304            count=2;
305        } else {
306            /* four bytes */
307            diff-=BOCU1_REACH_POS_3+1;
308            lead=BOCU1_START_POS_4;
309            count=3;
310        }
311    } else {
312        /* two- and four-byte negative differences */
313        if(diff>=BOCU1_REACH_NEG_2) {
314            /* two bytes */
315            diff-=BOCU1_REACH_NEG_1;
316            lead=BOCU1_START_NEG_2;
317            count=1;
318        } else if(diff>=BOCU1_REACH_NEG_3) {
319            /* three bytes */
320            diff-=BOCU1_REACH_NEG_2;
321            lead=BOCU1_START_NEG_3;
322            count=2;
323        } else {
324            /* four bytes */
325            diff-=BOCU1_REACH_NEG_3;
326            lead=BOCU1_START_NEG_4;
327            count=3;
328        }
329    }
330
331    /* encode the length of the packed result */
332    if(count<3) {
333        result=(count+1)<<24;
334    } else /* count==3, MSB used for the lead byte */ {
335        result=0;
336    }
337
338    /* calculate trail bytes like digits in itoa() */
339    shift=0;
340    do {
341        NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
342        result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
343        shift+=8;
344    } while(--count>0);
345
346    /* add lead byte */
347    result|=(lead+diff)<<shift;
348
349    return result;
350}
351
352/**
353 * BOCU-1 encoder function.
354 *
355 * @param pPrev pointer to the integer that holds
356 *        the "previous code point" state;
357 *        the initial value should be 0 which
358 *        encodeBocu1 will set to the actual BOCU-1 initial state value
359 * @param c the code point to encode
360 * @return the packed 1/2/3/4-byte encoding, see packDiff(),
361 *         or 0 if an error occurs
362 *
363 * @see packDiff
364 */
365U_CFUNC int32_t
366encodeBocu1(int32_t *pPrev, int32_t c) {
367    int32_t prev;
368
369    if(pPrev==NULL || c<0 || c>0x10ffff) {
370        /* illegal argument */
371        return 0;
372    }
373
374    prev=*pPrev;
375    if(prev==0) {
376        /* lenient handling of initial value 0 */
377        prev=*pPrev=BOCU1_ASCII_PREV;
378    }
379
380    if(c<=0x20) {
381        /*
382         * ISO C0 control & space:
383         * Encode directly for MIME compatibility,
384         * and reset state except for space, to not disrupt compression.
385         */
386        if(c!=0x20) {
387            *pPrev=BOCU1_ASCII_PREV;
388        }
389        return 0x01000000|c;
390    }
391
392    /*
393     * all other Unicode code points c==U+0021..U+10ffff
394     * are encoded with the difference c-prev
395     *
396     * a new prev is computed from c,
397     * placed in the middle of a 0x80-block (for most small scripts) or
398     * in the middle of the Unihan and Hangul blocks
399     * to statistically minimize the following difference
400     */
401    *pPrev=bocu1Prev(c);
402    return packDiff(c-prev);
403}
404
405/**
406 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
407 *
408 * @param pRx pointer to the decoder state structure
409 * @param b lead byte;
410 *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
411 * @return -1 (state change only)
412 *
413 * @see decodeBocu1
414 */
415static int32_t
416decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
417    int32_t c, count;
418
419    if(b>=BOCU1_START_NEG_2) {
420        /* positive difference */
421        if(b<BOCU1_START_POS_3) {
422            /* two bytes */
423            c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
424            count=1;
425        } else if(b<BOCU1_START_POS_4) {
426            /* three bytes */
427            c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
428            count=2;
429        } else {
430            /* four bytes */
431            c=BOCU1_REACH_POS_3+1;
432            count=3;
433        }
434    } else {
435        /* negative difference */
436        if(b>=BOCU1_START_NEG_3) {
437            /* two bytes */
438            c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
439            count=1;
440        } else if(b>BOCU1_MIN) {
441            /* three bytes */
442            c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
443            count=2;
444        } else {
445            /* four bytes */
446            c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
447            count=3;
448        }
449    }
450
451    /* set the state for decoding the trail byte(s) */
452    pRx->diff=c;
453    pRx->count=count;
454    return -1;
455}
456
457/**
458 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
459 *
460 * @param pRx pointer to the decoder state structure
461 * @param b trail byte
462 * @return result value, same as decodeBocu1
463 *
464 * @see decodeBocu1
465 */
466static int32_t
467decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
468    int32_t t, c, count;
469
470    if(b<=0x20) {
471        /* skip some C0 controls and make the trail byte range contiguous */
472        t=bocu1ByteToTrail[b];
473        if(t<0) {
474            /* illegal trail byte value */
475            pRx->prev=BOCU1_ASCII_PREV;
476            pRx->count=0;
477            return -99;
478        }
479#if BOCU1_MAX_TRAIL<0xff
480    } else if(b>BOCU1_MAX_TRAIL) {
481        return -99;
482#endif
483    } else {
484        t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
485    }
486
487    /* add trail byte into difference and decrement count */
488    c=pRx->diff;
489    count=pRx->count;
490
491    if(count==1) {
492        /* final trail byte, deliver a code point */
493        c=pRx->prev+c+t;
494        if(0<=c && c<=0x10ffff) {
495            /* valid code point result */
496            pRx->prev=bocu1Prev(c);
497            pRx->count=0;
498            return c;
499        } else {
500            /* illegal code point result */
501            pRx->prev=BOCU1_ASCII_PREV;
502            pRx->count=0;
503            return -99;
504        }
505    }
506
507    /* intermediate trail byte */
508    if(count==2) {
509        pRx->diff=c+t*BOCU1_TRAIL_COUNT;
510    } else /* count==3 */ {
511        pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
512    }
513    pRx->count=count-1;
514    return -1;
515}
516
517/**
518 * BOCU-1 decoder function.
519 *
520 * @param pRx pointer to the decoder state structure;
521 *        the initial values should be 0 which
522 *        decodeBocu1 will set to actual initial state values
523 * @param b an input byte
524 * @return
525 *      0..0x10ffff for a result code point
526 *      -1 if only the state changed without code point output
527 *     <-1 if an error occurs
528 */
529U_CFUNC int32_t
530decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
531    int32_t prev, c, count;
532
533    if(pRx==NULL) {
534        /* illegal argument */
535        return -99;
536    }
537
538    prev=pRx->prev;
539    if(prev==0) {
540        /* lenient handling of initial 0 values */
541        prev=pRx->prev=BOCU1_ASCII_PREV;
542        count=pRx->count=0;
543    } else {
544        count=pRx->count;
545    }
546
547    if(count==0) {
548        /* byte in lead position */
549        if(b<=0x20) {
550            /*
551             * Direct-encoded C0 control code or space.
552             * Reset prev for C0 control codes but not for space.
553             */
554            if(b!=0x20) {
555                pRx->prev=BOCU1_ASCII_PREV;
556            }
557            return b;
558        }
559
560        /*
561         * b is a difference lead byte.
562         *
563         * Return a code point directly from a single-byte difference.
564         *
565         * For multi-byte difference lead bytes, set the decoder state
566         * with the partial difference value from the lead byte and
567         * with the number of trail bytes.
568         *
569         * For four-byte differences, the signedness also affects the
570         * first trail byte, which has special handling farther below.
571         */
572        if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
573            /* single-byte difference */
574            c=prev+((int32_t)b-BOCU1_MIDDLE);
575            pRx->prev=bocu1Prev(c);
576            return c;
577        } else if(b==BOCU1_RESET) {
578            /* only reset the state, no code point */
579            pRx->prev=BOCU1_ASCII_PREV;
580            return -1;
581        } else {
582            return decodeBocu1LeadByte(pRx, b);
583        }
584    } else {
585        /* trail byte in any position */
586        return decodeBocu1TrailByte(pRx, b);
587    }
588}
589
590/* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
591
592/* test code ---------------------------------------------------------------- */
593
/* test code options */

/* When set, testText() ignores commas while processing name lists. */
#define TEST_IGNORE_COMMA 1
598
/**
 * Serialize a packed BOCU-1 byte sequence into a byte array,
 * most significant byte first. There is no overflow check.
 * Test function.
 *
 * @param packed packed BOCU-1 byte sequence, see packDiff()
 * @param p pointer to byte array
 * @return number of bytes
 *
 * @see packDiff
 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
    int32_t shift;

    /* emit the count low-order bytes of packed, highest first; nothing if count is 0 */
    for(shift=8*(count-1); shift>=0; shift-=8) {
        *p++=(uint8_t)(packed>>shift);
    }

    return count;
}
628
629/**
630 * Unpack a packed BOCU-1 non-C0/space byte sequence and get
631 * the difference to initialPrev.
632 * Used only for round-trip testing of the difference encoding and decoding.
633 * Test function.
634 *
635 * @param initialPrev bogus "previous code point" value to make sure that
636 *                    the resulting code point is in the range 0..0x10ffff
637 * @param packed packed BOCU-1 byte sequence
638 * @return the difference to initialPrev
639 *
640 * @see packDiff
641 * @see writeDiff
642 */
643static int32_t
644unpackDiff(int32_t initialPrev, int32_t packed) {
645    Bocu1Rx rx={ 0, 0, 0 };
646    int32_t count;
647
648    rx.prev=initialPrev;
649    count=BOCU1_LENGTH_FROM_PACKED(packed);
650    switch(count) {
651    case 4:
652        decodeBocu1(&rx, (uint8_t)(packed>>24));
653    case 3:
654        decodeBocu1(&rx, (uint8_t)(packed>>16));
655    case 2:
656        decodeBocu1(&rx, (uint8_t)(packed>>8));
657    case 1:
658        /* subtract initial prev */
659        return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
660    default:
661        return -0x7fffffff;
662    }
663}
664
/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks for roundtripping of the difference encoding and reports
 * a mismatch through log_err().
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array; no overflow check is done
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    /* generate the difference as a packed value and serialize it */
    int32_t packed, initialPrev, roundtrip;

    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    if(diff<=0) {
        initialPrev=0x10ffff;
    } else {
        initialPrev=-1;
    }

    /* decode once and reuse the result for both the check and the message */
    roundtrip=unpackDiff(initialPrev, packed);
    if(diff!=roundtrip) {
        /* int32_t is not necessarily long: cast to match the %ld/%lx conversions */
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
    }
    return p+writePacked(packed, p);
}
700
701/**
702 * Encode a UTF-16 string in BOCU-1.
703 * Does not check for overflows, but otherwise useful function.
704 *
705 * @param s input UTF-16 string
706 * @param length number of UChar code units in s
707 * @param p pointer to output byte array
708 * @return number of bytes output
709 */
710static int32_t
711writeString(const UChar *s, int32_t length, uint8_t *p) {
712    uint8_t *p0;
713    int32_t c, prev, i;
714
715    prev=0;
716    p0=p;
717    i=0;
718    while(i<length) {
719        U16_NEXT(s, i, length, c);
720        p+=writePacked(encodeBocu1(&prev, c), p);
721    }
722    return (int32_t)(p-p0);
723}
724
725/**
726 * Decode a BOCU-1 byte sequence to a UTF-16 string.
727 * Does not check for overflows, but otherwise useful function.
728 *
729 * @param p pointer to input BOCU-1 bytes
730 * @param length number of input bytes
731 * @param s point to output UTF-16 string array
732 * @return number of UChar code units output
733 */
734static int32_t
735readString(const uint8_t *p, int32_t length, UChar *s) {
736    Bocu1Rx rx={ 0, 0, 0 };
737    int32_t c, i, sLength;
738
739    i=sLength=0;
740    while(i<length) {
741        c=decodeBocu1(&rx, p[i++]);
742        if(c<-1) {
743            log_err("error: readString detects encoding error at string index %ld\n", i);
744            return -1;
745        }
746        if(c>=0) {
747            U16_APPEND_UNSAFE(s, sLength, c);
748        }
749    }
750    return sLength;
751}
752
/* Convert a value 0..15 to its lowercase hexadecimal digit character. */
static char
hexDigit(uint8_t digit) {
    if(digit>9) {
        return (char)('a'-10+digit);
    }
    return (char)('0'+digit);
}
757
/**
 * Pretty-print 0-terminated byte values as " xx" groups,
 * padded with spaces to the width of five groups.
 * Helper function for test output.
 *
 * @param bytes 0-terminated byte array to print; it is only read
 * @param out output buffer for the 0-terminated text; no overflow check
 *            is done, so it must hold 3 characters per input byte
 *            (at least 15 characters) plus the terminator
 */
static void
printBytes(const uint8_t *bytes, char *out) {
    int count, pad;
    uint8_t b;

    count=0;
    while((b=*bytes++)!=0) {
        *out++=' ';
        *out++=hexDigit((uint8_t)(b>>4));
        *out++=hexDigit((uint8_t)(b&0xf));
        ++count;
    }

    /* fill up to five columns; no padding if five or more bytes were printed */
    for(pad=3*(5-count); pad>0; --pad) {
        *out++=' ';
    }
    *out=0;
}
783
784/**
785 * Basic BOCU-1 test function, called when there are no command line arguments.
786 * Prints some of the #define values and performs round-trip tests of the
787 * difference encoding and decoding.
788 */
789static void
790TestBOCU1RefDiff(void) {
791    char buf1[80], buf2[80];
792    uint8_t prev[5], level[5];
793    int32_t i, cmp, countErrors;
794
795    log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
796    log_verbose("reach of 2 bytes     : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
797    log_verbose("reach of 3 bytes     : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
798
799    log_verbose("    BOCU1_REACH_NEG_1 %8ld    BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
800    log_verbose("    BOCU1_REACH_NEG_2 %8ld    BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
801    log_verbose("    BOCU1_REACH_NEG_3 %8ld    BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
802
803    log_verbose("    BOCU1_MIDDLE      0x%02x\n", BOCU1_MIDDLE);
804    log_verbose("    BOCU1_START_NEG_2 0x%02x    BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
805    log_verbose("    BOCU1_START_NEG_3 0x%02x    BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
806
807    /* test packDiff() & unpackDiff() with some specific values */
808    writeDiff(0, level);
809    writeDiff(1, level);
810    writeDiff(65, level);
811    writeDiff(130, level);
812    writeDiff(30000, level);
813    writeDiff(1000000, level);
814    writeDiff(-65, level);
815    writeDiff(-130, level);
816    writeDiff(-30000, level);
817    writeDiff(-1000000, level);
818
819    /* test that each value is smaller than any following one */
820    countErrors=0;
821    i=-0x10ffff;
822    *writeDiff(i, prev)=0;
823
824    /* show first number and bytes */
825    printBytes(prev, buf1);
826    log_verbose("              wD(%8ld)                    %s\n", i, buf1);
827
828    for(++i; i<=0x10ffff; ++i) {
829        *writeDiff(i, level)=0;
830        cmp=strcmp((const char *)prev, (const char *)level);
831        if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
832            log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
833                   level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
834        }
835        if(cmp<0) {
836            if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
837                /*
838                 * if the result is good, then print only if the length changed
839                 * to get little but interesting output
840                 */
841                printBytes(prev, buf1);
842                printBytes(level, buf2);
843                log_verbose("ok:    strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
844            }
845        } else {
846            ++countErrors;
847            printBytes(prev, buf1);
848            printBytes(level, buf2);
849            log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
850        }
851        /* remember the previous bytes */
852        memcpy(prev, level, 4);
853    }
854
855    /* show last number and bytes */
856    printBytes((uint8_t *)"", buf1);
857    printBytes(prev, buf2);
858    log_verbose("                            wD(%8ld)      %s%s\n", i-1, buf1, buf2);
859
860    if(countErrors==0) {
861        log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
862    } else {
863        log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
864    }
865
866    /* output signature byte sequence */
867    i=0;
868    writePacked(encodeBocu1(&i, 0xfeff), level);
869    log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
870            level[0], level[1], level[2]);
871}
872
873/* cintltst code ------------------------------------------------------------ */
874
875static const int32_t DEFAULT_BUFFER_SIZE = 30000;
876
877
878/* test one string with the ICU and the reference BOCU-1 implementations */
879static void
880roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
881    UChar *roundtripRef, *roundtripICU;
882    char *bocu1Ref, *bocu1ICU;
883
884    int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
885    UErrorCode errorCode;
886
887    roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
888    roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
889    bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);
890    bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);
891
892    /* Unicode -> BOCU-1 */
893    bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
894
895    errorCode=U_ZERO_ERROR;
896    bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
897    if(U_FAILURE(errorCode)) {
898        log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
899        goto cleanup;
900    }
901
902    if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
903        log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
904        goto cleanup;
905    }
906
907    /* BOCU-1 -> Unicode */
908    roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
909    if(roundtripRefLength<0) {
910        goto cleanup; /* readString() found an error and reported it */
911    }
912
913    roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode);
914    if(U_FAILURE(errorCode)) {
915        log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
916        goto cleanup;
917    }
918
919    if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
920        log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
921        goto cleanup;
922    }
923    if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
924        log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
925        goto cleanup;
926    }
927cleanup:
928    free(roundtripRef);
929    free(roundtripICU);
930    free(bocu1Ref);
931    free(bocu1ICU);
932}
933
934static const UChar feff[]={ 0xfeff };
935static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
936static const UChar crlf[]={ 0xd, 0xa, 0x20 };
937static const UChar nul[]={ 0 };
938static const UChar latin[]={ 0xdf, 0xe6 };
939static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
940static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
941static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
942static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
943static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
944static const UChar plane1[]={ 0xd800, 0xdc00 };
945static const UChar plane2[]={ 0xd845, 0xdddd };
946static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
947static const UChar plane16[]={ 0xdbff, 0xdfff };
948static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
949
950static const struct {
951    const UChar *s;
952    int32_t length;
953} strings[]={
954    { feff,         LENGTHOF(feff) },
955    { ascii,        LENGTHOF(ascii) },
956    { crlf,         LENGTHOF(crlf) },
957    { nul,          LENGTHOF(nul) },
958    { latin,        LENGTHOF(latin) },
959    { devanagari,   LENGTHOF(devanagari) },
960    { hiragana,     LENGTHOF(hiragana) },
961    { unihan,       LENGTHOF(unihan) },
962    { hangul,       LENGTHOF(hangul) },
963    { surrogates,   LENGTHOF(surrogates) },
964    { plane1,       LENGTHOF(plane1) },
965    { plane2,       LENGTHOF(plane2) },
966    { plane15,      LENGTHOF(plane15) },
967    { plane16,      LENGTHOF(plane16) },
968    { c0,           LENGTHOF(c0) }
969};
970
971/*
972 * Verify that the ICU BOCU-1 implementation produces the same results as
973 * the reference implementation from the design folder.
974 * Generate some texts and convert them with both converters, verifying
975 * identical results and roundtripping.
976 */
977static void
978TestBOCU1(void) {
979    UChar *text;
980    int32_t i, length;
981
982    UConverter *bocu1;
983    UErrorCode errorCode;
984
985    errorCode=U_ZERO_ERROR;
986    bocu1=ucnv_open("BOCU-1", &errorCode);
987    if(U_FAILURE(errorCode)) {
988        log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
989        return;
990    }
991
992    text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
993
994    /* text 1: each of strings[] once */
995    length=0;
996    for(i=0; i<LENGTHOF(strings); ++i) {
997        u_memcpy(text+length, strings[i].s, strings[i].length);
998        length+=strings[i].length;
999    }
1000    roundtripBOCU1(bocu1, 1, text, length);
1001
1002    /* text 2: each of strings[] twice */
1003    length=0;
1004    for(i=0; i<LENGTHOF(strings); ++i) {
1005        u_memcpy(text+length, strings[i].s, strings[i].length);
1006        length+=strings[i].length;
1007        u_memcpy(text+length, strings[i].s, strings[i].length);
1008        length+=strings[i].length;
1009    }
1010    roundtripBOCU1(bocu1, 2, text, length);
1011
1012    /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
1013    length=0;
1014    for(i=1; length<5000; i+=7) {
1015        if(i>=LENGTHOF(strings)) {
1016            i-=LENGTHOF(strings);
1017        }
1018        u_memcpy(text+length, strings[i].s, strings[i].length);
1019        length+=strings[i].length;
1020    }
1021    roundtripBOCU1(bocu1, 3, text, length);
1022
1023    ucnv_close(bocu1);
1024    free(text);
1025}
1026
1027U_CFUNC void addBOCU1Tests(TestNode** root);
1028
1029U_CFUNC void
1030addBOCU1Tests(TestNode** root) {
1031    addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
1032    addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
1033}
1034