1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1998-2006, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/*
7* File test.c
8*
9* Modification History:
10*
11*   Date          Name        Description
12*   07/24/2000    Madhu       Creation
13*******************************************************************************
14*/
15
16#include "unicode/utypes.h"
17#include "unicode/utf8.h"
18#include "cmemory.h"
19#include "cintltst.h"
20
21#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
22
23/* lenient UTF-8 ------------------------------------------------------------ */
24
25/*
26 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
27 * code points with their "natural" encoding.
28 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
29 * single surrogates.
30 *
31 * This is not conformant with UTF-8.
32 *
33 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
34 * the macros below do not attempt to assemble such pairs.
35 */
36
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 *
 * Reads the byte at s[i] into c and post-increments i.
 *  - A byte below 0x80 is returned unchanged.
 *  - A lead byte (per U8_IS_LEAD) is passed on to utf8_nextCharSafeBody(),
 *    called with -2 as its last argument and with length as the limit;
 *    that function advances i and supplies the value stored in c.
 *  - Any other byte yields U_SENTINEL.
 *
 * s, i and c are evaluated more than once; i and c must be modifiable
 * lvalues. Every macro argument is parenthesized so that compound
 * expressions (for example a conditional expression for s) expand correctly.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
47
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 *
 * Pre-decrements i and reads the byte at s[i] into c.
 *  - A byte below 0x80 is returned unchanged.
 *  - A byte in 0x80..0xbf is passed on to utf8_prevCharSafeBody(), called
 *    with -2 as its last argument and with start as the lower bound; that
 *    function adjusts i and supplies the value stored in c.
 *  - Any byte above 0xbf yields U_SENTINEL.
 *
 * s, i and c are evaluated more than once; i and c must be modifiable
 * lvalues. Every macro argument is parenthesized so that compound
 * expressions expand correctly.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
58
59/* -------------------------------------------------------------------------- */
60
/* Helper used by the append tests to print len bytes of a buffer. */
static void printUChars(const uint8_t *uchars, int16_t len);

/*
 * Test cases of this file, in the order in which they are registered
 * with the test framework.
 */
static void TestCodeUnitValues(void);   /* single/lead/trail classification */
static void TestCharLength(void);       /* code units needed per code point */
static void TestGetChar(void);          /* random-access reads */
static void TestNextPrevChar(void);     /* forward/backward reads */
static void TestFwdBack(void);          /* skipping forward/backward */
static void TestSetChar(void);          /* adjusting to code point boundaries */
static void TestAppendChar(void);
static void TestAppend(void);
static void TestSurrogates(void);
72
73void addUTF8Test(TestNode** root);
74
75void
76addUTF8Test(TestNode** root)
77{
78  addTest(root, &TestCodeUnitValues,    "utf8tst/TestCodeUnitValues");
79  addTest(root, &TestCharLength,        "utf8tst/TestCharLength"    );
80  addTest(root, &TestGetChar,           "utf8tst/TestGetChar"       );
81  addTest(root, &TestNextPrevChar,      "utf8tst/TestNextPrevChar"  );
82  addTest(root, &TestFwdBack,           "utf8tst/TestFwdBack"       );
83  addTest(root, &TestSetChar,           "utf8tst/TestSetChar"       );
84  addTest(root, &TestAppendChar,        "utf8tst/TestAppendChar"    );
85  addTest(root, &TestAppend,            "utf8tst/TestAppend"        );
86  addTest(root, &TestSurrogates,        "utf8tst/TestSurrogates"    );
87}
88
/*
 * Checks that the UTF8_IS_SINGLE/LEAD/TRAIL macros and their U8_IS_*
 * counterparts put each sample byte into exactly one class.
 * The sample table holds three groups of four bytes: bytes expected to be
 * single bytes, then lead bytes, then trail bytes.
 * A failure message reports the answers of the UTF8_IS_* macros.
 */
static void TestCodeUnitValues()
{
    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
    const int32_t total=(int32_t)(sizeof(codeunit)/sizeof(codeunit[0]));
    int32_t pos;

    for(pos=0; pos<total; ++pos) {
        const uint8_t c=codeunit[pos];
        /* answers of the older macro family; these are also what gets printed */
        const int oldSingle=UTF8_IS_SINGLE(c) ? 1 : 0;
        const int oldLead=UTF8_IS_LEAD(c) ? 1 : 0;
        const int oldTrail=UTF8_IS_TRAIL(c) ? 1 : 0;
        /* answers of the newer macro family */
        const int newSingle=U8_IS_SINGLE(c) ? 1 : 0;
        const int newLead=U8_IS_LEAD(c) ? 1 : 0;
        const int newTrail=U8_IS_TRAIL(c) ? 1 : 0;

        log_verbose("Testing code unit value of %x\n", c);

        /* the group index (0, 1 or 2) selects the expected class */
        switch(pos/4) {
        case 0:
            if(!(oldSingle && !oldLead && !oldTrail && newSingle && !newLead && !newTrail)) {
                log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
                    c, oldSingle ? 'y' : 'n', oldLead ? 'y' : 'n', oldTrail ? 'y' : 'n');
            }
            break;
        case 1:
            if(!(oldLead && !oldSingle && !oldTrail && newLead && !newSingle && !newTrail)) {
                log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
                    c, oldSingle ? 'y' : 'n', oldLead ? 'y' : 'n', oldTrail ? 'y' : 'n');
            }
            break;
        case 2:
            if(!(oldTrail && !oldSingle && !oldLead && newTrail && !newSingle && !newLead)) {
                log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
                    c, oldSingle ? 'y' : 'n', oldLead ? 'y' : 'n', oldTrail ? 'y' : 'n');
            }
            break;
        default:
            /* no expectation recorded for entries beyond the third group */
            break;
        }
    }
}
115
116static void TestCharLength()
117{
118    static const uint32_t codepoint[]={
119        1, 0x0061,
120        1, 0x007f,
121        2, 0x016f,
122        2, 0x07ff,
123        3, 0x0865,
124        3, 0x20ac,
125        4, 0x20402,
126        4, 0x23456,
127        4, 0x24506,
128        4, 0x20402,
129        4, 0x10402,
130        3, 0xd7ff,
131        3, 0xe000,
132
133    };
134
135    int16_t i;
136    UBool multiple;
137    for(i=0; i<sizeof(codepoint)/sizeof(codepoint[0]); i=(int16_t)(i+2)){
138        UChar32 c=codepoint[i+1];
139        if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uint16_t)codepoint[i]){
140              log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF8_CHAR_LENGTH(c));
141        }else{
142              log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_LENGTH(c) );
143        }
144        multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
145        if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
146              log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
147        }
148    }
149}
150
151static void TestGetChar()
152{
153    static const uint8_t input[]={
154    /*  code unit,*/
155        0x61,
156        0x7f,
157        0xe4,
158        0xba,
159        0x8c,
160        0xF0,
161        0x90,
162        0x90,
163        0x81,
164        0xc0,
165        0x65,
166        0x31,
167        0x9a,
168        0xc9
169    };
170    static const UChar32 result[]={
171     /*codepoint-unsafe,  codepoint-safe(not strict)  codepoint-safe(strict)*/
172        0x61,             0x61,                       0x61,
173        0x7f,             0x7f,                       0x7f,
174        0x4e8c,           0x4e8c,                     0x4e8c,
175        0x4e8c,           0x4e8c,                     0x4e8c ,
176        0x4e8c,           0x4e8c,                     0x4e8c,
177        0x10401,          0x10401,                    0x10401 ,
178        0x10401,          0x10401,                    0x10401 ,
179        0x10401,          0x10401,                    0x10401 ,
180        0x10401,          0x10401,                    0x10401,
181        0x25,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
182        0x65,             0x65,                       0x65,
183        0x31,             0x31,                       0x31,
184        0x31,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
185        0x240,            UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
186    };
187    uint16_t i=0;
188    UChar32 c;
189    uint32_t offset=0;
190
191    for(offset=0; offset<sizeof(input); offset++) {
192        if (offset < sizeof(input) - 1) {
193            UTF8_GET_CHAR_UNSAFE(input, offset, c);
194            if(c != result[i]){
195                log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
196
197            }
198
199            U8_GET_UNSAFE(input, offset, c);
200            if(c != result[i]){
201                log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
202
203            }
204        }
205
206        U8_GET(input, 0, offset, sizeof(input), c);
207        if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
208            log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
209        }
210
211        UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
212        if(c != result[i+1]){
213            log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
214        }
215
216        UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
217        if(c != result[i+2]){
218            log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
219        }
220
221         i=(uint16_t)(i+3);
222    }
223}
224
225static void TestNextPrevChar(){
226    static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
227    static const UChar32 result[]={
228    /*next_unsafe    next_safe_ns        next_safe_s          prev_unsafe   prev_safe_ns         prev_safe_s*/
229        0x0061,        0x0061,             0x0061,              0x0000,       0x0000,             0x0000,
230        0x10401,       0x10401,            0x10401,             0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
231        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841410,    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
232        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xa1050,      UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
233        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841,       UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
234        0x00,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x61,         0x61,               0x61,
235        0x80,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xc2,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
236        0xfd,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x77e,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
237        0xbe,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xfd,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
238        0xa1,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x00,         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
239        0x61,          0x61,               0x61,                0xc0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
240        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,      0x10401,            0x10401,
241        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
242        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
243        0x0840,        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
244        0x0000,        0x0000,             0x0000,              0x0061,       0x0061,             0x0061
245    };
246    static const int32_t movedOffset[]={
247   /*next_unsafe    next_safe_ns  next_safe_s       prev_unsafe   prev_safe_ns     prev_safe_s*/
248        1,            1,           1,                15,           15,               15,
249        5,            5,           5,                14,           14 ,              14,
250        3,            3,           3,                9,            13,               13,
251        4,            4,           4,                9,            12,               12,
252        5,            5,           5,                9,            11,               11,
253        7,            7,           7,                10,           10,               10,
254        7,            7,           7,                9,            9,                9,
255        8,            9,           9,                7,            7,                7,
256        9,            9,           9,                7,            7,                7,
257        11,           10,          10,               5,            5,                5,
258        11,           11,          11,               5,            5,                5,
259        12,           12,          12,               1,            1,                1,
260        13,           13,          13,               1,            1,                1,
261        14,           14,          14,               1,            1,                1,
262        14,           15,          15,               1,            1,                1,
263        14,           16,          16,               0,            0,                0,
264
265
266    };
267
268
269    UChar32 c=0x0000;
270    uint32_t i=0;
271    uint32_t offset=0;
272    int32_t setOffset=0;
273    for(offset=0; offset<sizeof(input); offset++){
274         if (offset < sizeof(input) - 2) { /* Can't have it go off the end of the array based on input */
275             setOffset=offset;
276             UTF8_NEXT_CHAR_UNSAFE(input, setOffset, c);
277             if(setOffset != movedOffset[i]){
278                 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
279                     offset, movedOffset[i], setOffset);
280             }
281             if(c != result[i]){
282                 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
283             }
284
285             setOffset=offset;
286             U8_NEXT_UNSAFE(input, setOffset, c);
287             if(setOffset != movedOffset[i]){
288                 log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
289                     offset, movedOffset[i], setOffset);
290             }
291             if(c != result[i]){
292                 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
293             }
294         }
295
296         setOffset=offset;
297         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
298         if(setOffset != movedOffset[i+1]){
299             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
300                 offset, movedOffset[i+1], setOffset);
301         }
302         if(c != result[i+1]){
303             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
304         }
305
306         setOffset=offset;
307         U8_NEXT(input, setOffset, sizeof(input), c);
308         if(setOffset != movedOffset[i+1]){
309             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
310                 offset, movedOffset[i+1], setOffset);
311         }
312         if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
313             log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
314         }
315
316         setOffset=offset;
317         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
318         if(setOffset != movedOffset[i+1]){
319             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
320                 offset, movedOffset[i+2], setOffset);
321         }
322         if(c != result[i+2]){
323             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
324         }
325
326         i=i+6;
327    }
328
329    i=0;
330    for(offset=sizeof(input); offset > 0; --offset){
331         setOffset=offset;
332         UTF8_PREV_CHAR_UNSAFE(input, setOffset, c);
333         if(setOffset != movedOffset[i+3]){
334             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
335                 offset, movedOffset[i+3], setOffset);
336         }
337         if(c != result[i+3]){
338             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
339         }
340
341         setOffset=offset;
342         UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
343         if(setOffset != movedOffset[i+4]){
344             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
345                 offset, movedOffset[i+4], setOffset);
346         }
347         if(c != result[i+4]){
348             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
349         }
350
351         setOffset=offset;
352         U8_PREV(input, 0, setOffset, c);
353         if(setOffset != movedOffset[i+4]){
354             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
355                 offset, movedOffset[i+4], setOffset);
356         }
357         if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
358             log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
359         }
360
361         setOffset=offset;
362         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
363         if(setOffset != movedOffset[i+5]){
364             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
365                 offset, movedOffset[i+5], setOffset);
366         }
367         if(c != result[i+5]){
368             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
369         }
370
371         i=i+6;
372    }
373
374    {
375        /* test non-characters */
376        static const uint8_t nonChars[]={
377            0xef, 0xb7, 0x90,       /* U+fdd0 */
378            0xef, 0xbf, 0xbf,       /* U+feff */
379            0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
380            0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
381            0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
382        };
383
384        UChar32 ch;
385        int32_t idx;
386
387        for(idx=0; idx<(int32_t)sizeof(nonChars);) {
388            U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
389            if(!U_IS_UNICODE_NONCHAR(ch)) {
390                log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
391            }
392        }
393        for(idx=(int32_t)sizeof(nonChars); idx>0;) {
394            U8_PREV(nonChars, 0, idx, ch);
395            if(!U_IS_UNICODE_NONCHAR(ch)) {
396                log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
397            }
398        }
399    }
400}
401
/*
 * Exercises the macros that skip forward or backward over code points
 * without reading them: FWD_1/BACK_1 (one code point per call) and
 * FWD_N/BACK_N (Nvalue[k] code points per call), each in its unsafe and safe
 * form and under both the UTF8_* and the U8_* names.
 *
 * Each expectation array lists the offset expected after successive calls.
 * The unsafe FWD_N/BACK_N loops use only the first six entries of Nvalue[].
 *
 * The offset is reset before every loop (previously the U8_FWD_1_UNSAFE and
 * U8_FWD_1 loops started at the end of the input and never ran).
 * The safe backward checks report back_safe[] as the expected value, and
 * log_err() arguments are cast to the types their format specifiers require.
 */
static void TestFwdBack(){
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
    static const uint16_t fwd_unsafe[] ={1, 5, 6, 7,  9, 10, 11, 13, 14, 15, 16,  20, };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0};
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};


    uint32_t offunsafe=0, offsafe=0;

    uint32_t i=0;

    /* ---- forward by one code point ---- */
    while(offunsafe < sizeof(input)){
        UTF8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], (int)offunsafe);
        }
        i++;
    }

    offunsafe=0;    /* start over from the beginning for the U8_ variant */
    i=0;
    while(offunsafe < sizeof(input)){
        U8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], (int)offunsafe);
        }
        i++;
    }

    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], (int)offsafe);
        }
        i++;
    }

    offsafe=0;      /* start over from the beginning for the U8_ variant */
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], (int)offsafe);
        }
        i++;
    }

    /* ---- backward by one code point ---- */
    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        UTF8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], (int)offunsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        U8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], (int)offunsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], (int)offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], (int)offsafe);
        }
        i++;
    }

    /* ---- forward by N code points ---- */
    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", (int)i, fwd_N_unsafe[i], (int)offunsafe);
        }
    }

    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", (int)i, fwd_N_unsafe[i], (int)offunsafe);
        }
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", (int)i, fwd_N_safe[i], (int)offsafe);
        }

    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", (int)i, fwd_N_safe[i], (int)offsafe);
        }

    }

    /* ---- backward by N code points ---- */
    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", (int)i, back_N_unsafe[i], (int)offunsafe);
        }
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", (int)i, back_N_unsafe[i], (int)offunsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", (int)i, back_N_safe[i], (long)offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", (int)i, back_N_safe[i], (long)offsafe);
        }
    }
}
560
/*
 * Exercises the macros that adjust an arbitrary offset to a code point
 * boundary: SET_CHAR_START / SET_CP_START (move to the start of the code
 * point) and SET_CHAR_LIMIT / SET_CP_LIMIT (move to its limit), each in
 * unsafe and safe form under both the UTF8_* and the U8_* names.
 *
 * The four expectation arrays are indexed by the input offset under test.
 * The unsafe LIMIT macros are not exercised at offset 0.
 *
 * log_err() arguments are cast to long to match the %ld specifiers.
 */
static void TestSetChar(){
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   13 };
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13 };

    uint32_t i=0;       /* index into the expectation arrays; tracks offset */
    int32_t offset=0, setOffset=0;
    for(offset=0; offset<(int32_t)sizeof(input); offset++){
         setOffset=offset;
         UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
         if(setOffset != start_unsafe[i]){
             log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                 (long)offset, (long)start_unsafe[i], (long)setOffset);
         }

         setOffset=offset;
         U8_SET_CP_START_UNSAFE(input, setOffset);
         if(setOffset != start_unsafe[i]){
             log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                 (long)offset, (long)start_unsafe[i], (long)setOffset);
         }

         setOffset=offset;
         UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
         if(setOffset != start_safe[i]){
             log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                 (long)offset, (long)start_safe[i], (long)setOffset);
         }

         setOffset=offset;
         U8_SET_CP_START(input, 0, setOffset);
         if(setOffset != start_safe[i]){
             log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                 (long)offset, (long)start_safe[i], (long)setOffset);
         }

         /* The unsafe LIMIT macros are skipped at offset 0 so that they cannot run outside the array */
         if (offset != 0) {
             setOffset=offset;
             UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
             if(setOffset != limit_unsafe[i]){
                 log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                     (long)offset, (long)limit_unsafe[i], (long)setOffset);
             }

             setOffset=offset;
             U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
             if(setOffset != limit_unsafe[i]){
                 log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                     (long)offset, (long)limit_unsafe[i], (long)setOffset);
             }
         }

         setOffset=offset;
         UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
         if(setOffset != limit_safe[i]){
             log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                 (long)offset, (long)limit_safe[i], (long)setOffset);
         }

         setOffset=offset;
         U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
         if(setOffset != limit_safe[i]){
             log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                 (long)offset, (long)limit_safe[i], (long)setOffset);
         }

         i++;
    }
}
629
630static void TestAppendChar(){
631    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
632    static const uint32_t test[]={
633     /*append-position(unsafe),  CHAR to be appended  */
634        0,                        0x10401,
635        2,                        0x0028,
636        2,                        0x007f,
637        3,                        0xd801,
638        1,                        0x20402,
639        8,                        0x10401,
640        5,                        0xc0,
641        5,                        0xc1,
642        5,                        0xfd,
643        6,                        0x80,
644        6,                        0x81,
645        6,                        0xbf,
646        7,                        0xfe,
647
648    /*append-position(safe),     CHAR to be appended */
649        0,                        0x10401,
650        2,                        0x0028,
651        3,                        0x7f,
652        3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
653        1,                        0x20402,
654        9,                        0x10401,
655        5,                        0xc0,
656        5,                        0xc1,
657        5,                        0xfd,
658        6,                        0x80,
659        6,                        0x81,
660        6,                        0xbf,
661        7,                        0xfe,
662
663    };
664    static const uint16_t movedOffset[]={
665        /*offset-moved-to(unsafe)*/
666          4,              /*for append-pos: 0 , CHAR 0x10401*/
667          3,
668          3,
669          6,
670          5,
671          12,
672          7,
673          7,
674          7,
675          8,
676          8,
677          8,
678          9,
679
680          /*offset-moved-to(safe)*/
681          4,              /*for append-pos: 0, CHAR  0x10401*/
682          3,
683          4,
684          6,
685          5,
686          11,
687          7,
688          7,
689          7,
690          8,
691          8,
692          8,
693          9,
694
695    };
696
697    static const uint8_t result[][11]={
698        /*unsafe*/
699        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
700        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
701        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
702        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
703        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
704        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},
705
706        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
707        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
708        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
709
710        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
711        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
712        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
713
714        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
715        /*safe*/
716        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
717        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
718        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
719        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
720        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
721        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc0, 0x9f*/
722
723        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
724        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
725        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
726
727        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
728        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
729        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
730
731        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
732
733    };
734    uint16_t i, count=0;
735    uint8_t str[12];
736    uint32_t offset;
737/*    UChar32 c=0;*/
738    uint16_t size=sizeof(s)/sizeof(s[0]);
739    for(i=0; i<sizeof(test)/sizeof(test[0]); i=(uint16_t)(i+2)){
740        uprv_memcpy(str, s, size);
741        offset=test[i];
742        if(count<13){
743            UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
744            if(offset != movedOffset[count]){
745                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
746                    count, movedOffset[count], offset);
747
748            }
749            if(uprv_memcmp(str, result[count], size) !=0){
750                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count);
751                printUChars(result[count], size);
752                log_err("\nGot:      ");
753                printUChars(str, size);
754                log_err("\n");
755            }
756        }else{
757            UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
758            if(offset != movedOffset[count]){
759                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
760                    count, movedOffset[count], offset);
761
762            }
763            if(uprv_memcmp(str, result[count], size) !=0){
764                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count);
765                printUChars(result[count], size);
766                log_err("\nGot:     ");
767                printUChars(str, size);
768                log_err("\n");
769            }
770            /*call the API instead of MACRO
771            uprv_memcpy(str, s, size);
772            offset=test[i];
773            c=test[i+1];
774            if((uint32_t)(c)<=0x7f) {
775                  (str)[(offset)++]=(uint8_t)(c);
776            } else {
777                 (offset)=utf8_appendCharSafeBody(str, (int32_t)(offset), (int32_t)(size), c);
778            }
779            if(offset != movedOffset[count]){
780                log_err("ERROR: utf8_appendCharSafeBody() failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
781                    count, movedOffset[count], offset);
782
783            }
784            if(uprv_memcmp(str, result[count], size) !=0){
785                log_err("ERROR: utf8_appendCharSafeBody() failed for count=%d. \nExpected:", count);
786                printUChars(result[count], size);
787                printf("\nGot:     ");
788                printUChars(str, size);
789                printf("\n");
790            }
791            */
792        }
793        count++;
794    }
795
796
797}
798
799static void TestAppend() {
800    static const UChar32 codePoints[]={
801        0x61, 0xdf, 0x901, 0x3040,
802        0xac00, 0xd800, 0xdbff, 0xdcde,
803        0xdffd, 0xe000, 0xffff, 0x10000,
804        0x12345, 0xe0021, 0x10ffff, 0x110000,
805        0x234567, 0x7fffffff, -1, -1000,
806        0, 0x400
807    };
808    static const uint8_t expectUnsafe[]={
809        0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
810        0xea, 0xb0, 0x80,  0xed, 0xa0, 0x80,  0xed, 0xaf, 0xbf,  0xed, 0xb3, 0x9e,
811        0xed, 0xbf, 0xbd,  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
812        0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
813        /* none from this line */
814        0,  0xd0, 0x80
815    }, expectSafe[]={
816        0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
817        0xea, 0xb0, 0x80,  /* no surrogates */
818        /* no surrogates */  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
819        0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
820        /* none from this line */
821        0,  0xd0, 0x80
822    };
823
824    uint8_t buffer[100];
825    UChar32 c;
826    int32_t i, length;
827    UBool isError, expectIsError, wrongIsError;
828
829    length=0;
830    for(i=0; i<LENGTHOF(codePoints); ++i) {
831        c=codePoints[i];
832        if(c<0 || 0x10ffff<c) {
833            continue; /* skip non-code points for U8_APPEND_UNSAFE */
834        }
835
836        U8_APPEND_UNSAFE(buffer, length, c);
837    }
838    if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
839        log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
840    }
841
842    length=0;
843    wrongIsError=FALSE;
844    for(i=0; i<LENGTHOF(codePoints); ++i) {
845        c=codePoints[i];
846        expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
847        isError=FALSE;
848
849        U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
850        wrongIsError|= isError!=expectIsError;
851    }
852    if(wrongIsError) {
853        log_err("U8_APPEND did not set isError correctly\n");
854    }
855    if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
856        log_err("U8_APPEND did not generate the expected output\n");
857    }
858}
859
860static void
861TestSurrogates() {
862    static const uint8_t b[]={
863        0xc3, 0x9f,             /*  00DF */
864        0xed, 0x9f, 0xbf,       /*  D7FF */
865        0xed, 0xa0, 0x81,       /*  D801 */
866        0xed, 0xbf, 0xbe,       /*  DFFE */
867        0xee, 0x80, 0x80,       /*  E000 */
868        0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */
869    };
870    static const UChar32 cp[]={
871        0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
872    };
873
874    UChar32 cu, cs, cl;
875    int32_t i, j, k, iu, is, il, length;
876
877    k=0; /* index into cp[] */
878    length=LENGTHOF(b);
879    for(i=0; i<length;) {
880        j=i;
881        U8_NEXT_UNSAFE(b, j, cu);
882        iu=j;
883
884        j=i;
885        U8_NEXT(b, j, length, cs);
886        is=j;
887
888        j=i;
889        L8_NEXT(b, j, length, cl);
890        il=j;
891
892        if(cu!=cp[k]) {
893            log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
894        }
895
896        /* U8_NEXT() returns <0 for surrogate code points */
897        if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
898            log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
899        }
900
901        /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
902        if(cl!=cu) {
903            log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
904        }
905
906        if(is!=iu || il!=iu) {
907            log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
908        }
909
910        ++k;    /* next code point */
911        i=iu;   /* advance by one UTF-8 sequence */
912    }
913
914    while(i>0) {
915        --k; /* previous code point */
916
917        j=i;
918        U8_PREV_UNSAFE(b, j, cu);
919        iu=j;
920
921        j=i;
922        U8_PREV(b, 0, j, cs);
923        is=j;
924
925        j=i;
926        L8_PREV(b, 0, j, cl);
927        il=j;
928
929        if(cu!=cp[k]) {
930            log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
931        }
932
933        /* U8_PREV() returns <0 for surrogate code points */
934        if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
935            log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
936        }
937
938        /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
939        if(cl!=cu) {
940            log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
941        }
942
943        if(is!=iu || il !=iu) {
944            log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
945        }
946
947        i=iu;   /* go back by one UTF-8 sequence */
948    }
949}
950
/*
 * Writes len bytes starting at uchars through log_err(), each formatted as
 * "0x%02x " (two hex digits followed by a space). No newline is emitted;
 * a len of zero or less prints nothing.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    const uint8_t *cursor=uchars;
    int16_t remaining;

    for(remaining=len; remaining>0; --remaining){
        log_err("0x%02x ", *cursor++);
    }
}
957