1/*
2*******************************************************************************
3*
4*   Copyright (C) 2003-2007, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  uciter8.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003jan10
14*   created by: Markus W. Scherer
15*
16*   This file contains sample code that illustrates reading
17*   8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
18*   and also accepting single surrogates.
19*/
20
21#include <stdio.h>
22#include <string.h>
23#include "unicode/utypes.h"
24#include "unicode/uiter.h"
25#include "uit_len8.h"
26
/*
 * Number of elements in an array: total size divided by the size of the
 * first element.
 * The argument must be a real array; for a pointer, sizeof(array) would be
 * the size of the pointer and the result would not be an element count.
 */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

/*
 * The test functions below report problems through log_err;
 * in this file it is simply printf, so messages go to standard output.
 */
#define log_err printf
30
31/* UCharIterator test ------------------------------------------------------- */
32
33/*
34 * The following code is a copy of the UCharIterator test code in
35 * source/test/cintltst/custrtst.c,
36 * testing the lenient-8 iterator instead of the UTF-8 one.
37 */
38
39/*
40 * Compare results from two iterators, should be same.
41 * Assume that the text is not empty and that
42 * iteration start==0 and iteration limit==length.
43 */
44static void
45compareIterators(UCharIterator *iter1, const char *n1,
46                 UCharIterator *iter2, const char *n2) {
47    int32_t i, pos1, pos2, middle, length;
48    UChar32 c1, c2;
49
50    /* compare lengths */
51    length=iter1->getIndex(iter1, UITER_LENGTH);
52    pos2=iter2->getIndex(iter2, UITER_LENGTH);
53    if(length!=pos2) {
54        log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
55        return;
56    }
57
58    /* set into the middle */
59    middle=length/2;
60
61    pos1=iter1->move(iter1, middle, UITER_ZERO);
62    if(pos1!=middle) {
63        log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
64        return;
65    }
66
67    pos2=iter2->move(iter2, middle, UITER_ZERO);
68    if(pos2!=middle) {
69        log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
70        return;
71    }
72
73    /* test current() */
74    c1=iter1->current(iter1);
75    c2=iter2->current(iter2);
76    if(c1!=c2) {
77        log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
78        return;
79    }
80
81    /* move forward 3 UChars */
82    for(i=0; i<3; ++i) {
83        c1=iter1->next(iter1);
84        c2=iter2->next(iter2);
85        if(c1!=c2) {
86            log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
87            return;
88        }
89    }
90
91    /* move backward 5 UChars */
92    for(i=0; i<5; ++i) {
93        c1=iter1->previous(iter1);
94        c2=iter2->previous(iter2);
95        if(c1!=c2) {
96            log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
97            return;
98        }
99    }
100
101    /* iterate forward from the beginning */
102    pos1=iter1->move(iter1, 0, UITER_START);
103    if(pos1<0) {
104        log_err("%s->move(start) failed\n", n1);
105        return;
106    }
107    if(!iter1->hasNext(iter1)) {
108        log_err("%s->hasNext() at the start returns FALSE\n", n1);
109        return;
110    }
111
112    pos2=iter2->move(iter2, 0, UITER_START);
113    if(pos2<0) {
114        log_err("%s->move(start) failed\n", n2);
115        return;
116    }
117    if(!iter2->hasNext(iter2)) {
118        log_err("%s->hasNext() at the start returns FALSE\n", n2);
119        return;
120    }
121
122    do {
123        c1=iter1->next(iter1);
124        c2=iter2->next(iter2);
125        if(c1!=c2) {
126            log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
127            return;
128        }
129    } while(c1>=0);
130
131    if(iter1->hasNext(iter1)) {
132        log_err("%s->hasNext() at the end returns TRUE\n", n1);
133        return;
134    }
135    if(iter2->hasNext(iter2)) {
136        log_err("%s->hasNext() at the end returns TRUE\n", n2);
137        return;
138    }
139
140    /* back to the middle */
141    pos1=iter1->move(iter1, middle, UITER_ZERO);
142    if(pos1!=middle) {
143        log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
144        return;
145    }
146
147    pos2=iter2->move(iter2, middle, UITER_ZERO);
148    if(pos2!=middle) {
149        log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
150        return;
151    }
152
153    /* move to index 1 */
154    pos1=iter1->move(iter1, 1, UITER_ZERO);
155    if(pos1!=1) {
156        log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
157        return;
158    }
159
160    pos2=iter2->move(iter2, 1, UITER_ZERO);
161    if(pos2!=1) {
162        log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
163        return;
164    }
165
166    /* iterate backward from the end */
167    pos1=iter1->move(iter1, 0, UITER_LIMIT);
168    if(pos1<0) {
169        log_err("%s->move(limit) failed\n", n1);
170        return;
171    }
172    if(!iter1->hasPrevious(iter1)) {
173        log_err("%s->hasPrevious() at the end returns FALSE\n", n1);
174        return;
175    }
176
177    pos2=iter2->move(iter2, 0, UITER_LIMIT);
178    if(pos2<0) {
179        log_err("%s->move(limit) failed\n", n2);
180        return;
181    }
182    if(!iter2->hasPrevious(iter2)) {
183        log_err("%s->hasPrevious() at the end returns FALSE\n", n2);
184        return;
185    }
186
187    do {
188        c1=iter1->previous(iter1);
189        c2=iter2->previous(iter2);
190        if(c1!=c2) {
191            log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
192            return;
193        }
194    } while(c1>=0);
195
196    if(iter1->hasPrevious(iter1)) {
197        log_err("%s->hasPrevious() at the start returns TRUE\n", n1);
198        return;
199    }
200    if(iter2->hasPrevious(iter2)) {
201        log_err("%s->hasPrevious() at the start returns TRUE\n", n2);
202        return;
203    }
204}
205
206/*
207 * Test the iterator's getState() and setState() functions.
208 * iter1 and iter2 must be set up for the same iterator type and the same string
209 * but may be physically different structs (different addresses).
210 *
211 * Assume that the text is not empty and that
212 * iteration start==0 and iteration limit==length.
213 * It must be 2<=middle<=length-2.
214 */
215static void
216testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
217    UChar32 u[4];
218
219    UErrorCode errorCode;
220    UChar32 c;
221    uint32_t state;
222    int32_t i, j;
223
224    /* get four UChars from the middle of the string */
225    iter1->move(iter1, middle-2, UITER_ZERO);
226    for(i=0; i<4; ++i) {
227        c=iter1->next(iter1);
228        if(c<0) {
229            /* the test violates the assumptions, see comment above */
230            log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
231            return;
232        }
233        u[i]=c;
234    }
235
236    /* move to the middle and get the state */
237    iter1->move(iter1, -2, UITER_CURRENT);
238    state=uiter_getState(iter1);
239
240    /* set the state into the second iterator and compare the results */
241    errorCode=U_ZERO_ERROR;
242    uiter_setState(iter2, state, &errorCode);
243    if(U_FAILURE(errorCode)) {
244        log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
245        return;
246    }
247
248    c=iter2->current(iter2);
249    if(c!=u[2]) {
250        log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
251    }
252
253    c=iter2->previous(iter2);
254    if(c!=u[1]) {
255        log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
256    }
257
258    iter2->move(iter2, 2, UITER_CURRENT);
259    c=iter2->next(iter2);
260    if(c!=u[3]) {
261        log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
262    }
263
264    iter2->move(iter2, -3, UITER_CURRENT);
265    c=iter2->previous(iter2);
266    if(c!=u[0]) {
267        log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
268    }
269
270    /* move the second iterator back to the middle */
271    iter2->move(iter2, 1, UITER_CURRENT);
272    iter2->next(iter2);
273
274    /* check that both are in the middle */
275    i=iter1->getIndex(iter1, UITER_CURRENT);
276    j=iter2->getIndex(iter2, UITER_CURRENT);
277    if(i!=middle) {
278        log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
279    }
280    if(i!=j) {
281        log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
282    }
283
284    /* compare lengths */
285    i=iter1->getIndex(iter1, UITER_LENGTH);
286    j=iter2->getIndex(iter2, UITER_LENGTH);
287    if(i!=j) {
288        log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
289    }
290}
291
292static void
293TestLenient8Iterator() {
294    static const UChar text[]={
295        0x61, 0x62, 0x63,
296        /* dffd 107fd             d801    dffd - in UTF-16, U+107fd=<d801 dffd> */
297        0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
298        0x78, 0x79, 0x7a, 0
299    };
300    static const uint8_t bytes[]={
301        0x61, 0x62, 0x63,
302        /* dffd            107fd                    d801               dffd - mixture */
303        0xed, 0xbf, 0xbd,  0xf0, 0x90, 0x9f, 0xbd,  0xed, 0xa0, 0x81,  0xed, 0xbf, 0xbd,
304        0x78, 0x79, 0x7a, 0
305    };
306
307    UCharIterator iter1, iter2;
308    UChar32 c1, c2;
309    int32_t length;
310
311    puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
312
313    /* compare the same string between UTF-16 and lenient-8 UCharIterators */
314    uiter_setString(&iter1, text, -1);
315    uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
316    compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
317
318    /* try again with length=-1 */
319    uiter_setLenient8(&iter2, (const char *)bytes, -1);
320    compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
321
322    /* test get/set state */
323    length=LENGTHOF(text)-1;
324    uiter_setLenient8(&iter1, (const char*)bytes, -1);
325    testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
326    testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
327
328    /* ---------------------------------------------------------------------- */
329
330    puts("no output so far means that the lenient-8 iterator works fine");
331
332    puts("iterate forward:\nUTF-16\tlenient-8");
333    uiter_setString(&iter1, text, -1);
334    iter1.move(&iter1, 0, UITER_START);
335    iter2.move(&iter2, 0, UITER_START);
336    for(;;) {
337        c1=iter1.next(&iter1);
338        c2=iter2.next(&iter2);
339        if(c1<0 && c2<0) {
340            break;
341        }
342        if(c1<0) {
343            printf("\t%04x\n", c2);
344        } else if(c2<0) {
345            printf("%04x\n", c1);
346        } else {
347            printf("%04x\t%04x\n", c1, c2);
348        }
349    }
350}
351
/*
 * Program entry point: runs the lenient-8 iterator test.
 * The command-line arguments are not used.
 * Always returns 0; test problems show up only as printed output.
 */
extern int
main(int argc, char *argv[]) {
    /* unused; referenced only to avoid unused-parameter warnings */
    (void)argc;
    (void)argv;

    TestLenient8Iterator();
    return 0;
}
357