/* ucnvseltst.c revision b0ac937921a2c196d8b9da665135bf6ba01a1ccf */
/********************************************************************
 * Copyright (c) 1997-2009, International Business Machines
 * Corporation and others. All Rights Reserved.
 ********************************************************************
 *
 * File UCNVSELTST.C
 *
 * Modification History:
 *        Name                     Description
 *     MOHAMED ELDAWY               Creation
 ********************************************************************
 */
13
/* C API and functionality test for the converter selector (ucnvsel.h) */
15
16#include "ucnvseltst.h"
17
18#include <stdio.h>
19
20#include "unicode/utypes.h"
21#include "unicode/ucnvsel.h"
22#include "unicode/ustring.h"
23#include "cmemory.h"
24#include "cstring.h"
25
/* Number of elements of a true array (not a pointer), as an int32_t. */
#define LENGTHOF(array) ((int32_t)(sizeof(array) / sizeof((array)[0])))

/* Size in bytes, terminating NUL included, of the path buffer in fopenOrError(). */
#define FILENAME_BUFFER 1024

/* Relative path that fopenOrError() appends to ctest_dataSrcDir(). */
#define TDSRCPATH \
    ".." U_FILE_SEP_STRING "test" U_FILE_SEP_STRING "testdata" U_FILE_SEP_STRING
31
32static void TestSelector(void);
33
34void addCnvSelTest(TestNode** root)
35{
36    addTest(root, &TestSelector, "tsconv/ucnvseltst/TestSelector");
37}
38
/*
 * Cached array of the available converter names; filled by
 * getAvailableNames() and freed by releaseAvailableNames().
 * NULL while nothing is cached.
 */
static const char **gAvailableNames = NULL;
/* Number of entries in gAvailableNames. */
static int32_t gCountAvailable = 0;
41
42static UBool
43getAvailableNames() {
44  int32_t i;
45  if (gAvailableNames != NULL) {
46    return TRUE;
47  }
48  gCountAvailable = ucnv_countAvailable();
49  gAvailableNames = (const char **)uprv_malloc(gCountAvailable * sizeof(const char *));
50  if (gAvailableNames == NULL) {
51    log_err("unable to allocate memory for %ld available converter names\n",
52            (long)gCountAvailable);
53    return FALSE;
54  }
55  for (i = 0; i < gCountAvailable; ++i) {
56    gAvailableNames[i] = ucnv_getAvailableName(i);
57  }
58  return TRUE;
59}
60
61static void
62releaseAvailableNames() {
63  uprv_free((void *)gAvailableNames);
64  gAvailableNames = NULL;
65  gCountAvailable = 0;
66}
67
68static const char **
69getEncodings(int32_t start, int32_t step, int32_t count, int32_t *pCount) {
70  const char **names;
71  int32_t i;
72
73  *pCount = 0;
74  if (count <= 0) {
75    return NULL;
76  }
77  names = (const char **)uprv_malloc(count * sizeof(char *));
78  if (names == NULL) {
79    log_err("memory allocation error for %ld pointers\n", (long)count);
80    return NULL;
81  }
82  if (step == 0 && count > 0) {
83    step = 1;
84  }
85  for (i = 0; i < count; ++i) {
86    if (0 <= start && start < gCountAvailable) {
87      names[i] = gAvailableNames[start];
88      start += step;
89      ++*pCount;
90    }
91  }
92  return names;
93}
94
#if 0
/*
 * Disabled on purpose: ucnvsel_open() has no way to express "no encodings".
 * When it is given 0 encodings it opens a selector for all available ones.
 */
static const char **
getNoEncodings(int32_t *pCount) {
  *pCount = 0;
  return NULL;
}
#endif
106
107static const char **
108getOneEncoding(int32_t *pCount) {
109  return getEncodings(1, 0, 1, pCount);
110}
111
112static const char **
113getFirstEvenEncodings(int32_t *pCount) {
114  return getEncodings(0, 2, 25, pCount);
115}
116
117static const char **
118getMiddleEncodings(int32_t *pCount) {
119  return getEncodings(gCountAvailable - 12, 1, 22, pCount);
120}
121
122static const char **
123getLastEncodings(int32_t *pCount) {
124  return getEncodings(gCountAvailable - 1, -1, 25, pCount);
125}
126
127static const char **
128getSomeEncodings(int32_t *pCount) {
129  /* 20 evenly distributed */
130  return getEncodings(5, (gCountAvailable + 19)/ 20, 20, pCount);
131}
132
133static const char **
134getEveryThirdEncoding(int32_t *pCount) {
135  return getEncodings(2, 3, (gCountAvailable + 2 )/ 3, pCount);
136}
137
138static const char **
139getAllEncodings(int32_t *pCount) {
140  return getEncodings(0, 1, gCountAvailable, pCount);
141}
142
143typedef const char **GetEncodingsFn(int32_t *);
144
145static GetEncodingsFn *const getEncodingsFns[] = {
146  getOneEncoding,
147  getFirstEvenEncodings,
148  getMiddleEncodings,
149  getLastEncodings,
150  getSomeEncodings,
151  getEveryThirdEncoding,
152  getAllEncodings
153};
154
155static FILE *fopenOrError(const char *filename) {
156    int32_t needLen;
157    FILE *f;
158    char fnbuf[FILENAME_BUFFER];
159    const char* directory= ctest_dataSrcDir();
160    needLen = uprv_strlen(directory)+uprv_strlen(TDSRCPATH)+uprv_strlen(filename)+1;
161    if(needLen > FILENAME_BUFFER) {
162        log_err("FAIL: Could not load %s. Filename buffer overflow, needed %d but buffer is %d\n",
163                filename, needLen, FILENAME_BUFFER);
164        return NULL;
165    }
166
167    strcpy(fnbuf, directory);
168    strcat(fnbuf, TDSRCPATH);
169    strcat(fnbuf, filename);
170
171    f = fopen(fnbuf, "rb");
172
173    if(f == NULL) {
174        log_data_err("FAIL: Could not load %s [%s]\n", fnbuf, filename);
175    }
176    return f;
177}
178
/* The test text file held in memory, plus a cursor over its NUL-separated strings. */
typedef struct TestText {
  char *text;       /* start of the buffer allocated by text_open() */
  char *textLimit;  /* one past the last text byte; holds the final NUL */
  char *limit;      /* end of the string delivered last; equals text after a reset */
  int32_t number;   /* zero-based index of the string delivered last */
} TestText;
184
185static void
186text_reset(TestText *tt) {
187  tt->limit = tt->text;
188  tt->number = 0;
189}
190
191static char *
192text_nextString(TestText *tt, int32_t *pLength) {
193  char *s = tt->limit;
194  if (s == tt->textLimit) {
195    /* we already delivered the last string */
196    return NULL;
197  } else if (s == tt->text) {
198    /* first string */
199    if ((tt->textLimit - tt->text) >= 3 &&
200        s[0] == (char)0xef && s[1] == (char)0xbb && s[2] == (char)0xbf
201    ) {
202      s += 3;  /* skip the UTF-8 signature byte sequence (U+FEFF) */
203    }
204  } else {
205    /* skip the string terminator */
206    ++s;
207    ++tt->number;
208  }
209
210  /* find the end of this string */
211  tt->limit = uprv_strchr(s, 0);
212  *pLength = (int32_t)(tt->limit - s);
213  return s;
214}
215
216static UBool
217text_open(TestText *tt) {
218  FILE *f;
219  char *s;
220  int32_t length;
221  uprv_memset(tt, 0, sizeof(TestText));
222  f = fopenOrError("ConverterSelectorTestUTF8.txt");
223  if(!f) {
224    return FALSE;
225  }
226  fseek(f, 0, SEEK_END);
227  length = (int32_t)ftell(f);
228  fseek(f, 0, SEEK_SET);
229  tt->text = (char *)uprv_malloc(length + 1);
230  if (tt->text == NULL) {
231    fclose(f);
232    return FALSE;
233  }
234  if (length != fread(tt->text, 1, length, f)) {
235    log_err("error reading %ld bytes from test text file\n", (long)length);
236    length = 0;
237    uprv_free(tt->text);
238  }
239  fclose(f);
240  tt->textLimit = tt->text + length;
241  *tt->textLimit = 0;
242  /* replace all Unicode '#' (U+0023) with NUL */
243  for(s = tt->text; (s = uprv_strchr(s, 0x23)) != NULL; *s++ = 0) {}
244  text_reset(tt);
245  return TRUE;
246}
247
248static void
249text_close(TestText *tt) {
250  uprv_free(tt->text);
251}
252
253static int32_t findIndex(const char* converterName) {
254  int32_t i;
255  for (i = 0 ; i < gCountAvailable; i++) {
256    if(ucnv_compareNames(gAvailableNames[i], converterName) == 0) {
257      return i;
258    }
259  }
260  return -1;
261}
262
263static UBool *
264getResultsManually(const char** encodings, int32_t num_encodings,
265                   const char *utf8, int32_t length,
266                   const USet* excludedCodePoints, const UConverterUnicodeSet whichSet) {
267  UBool* resultsManually;
268  int32_t i;
269
270  resultsManually = (UBool*) uprv_malloc(gCountAvailable);
271  uprv_memset(resultsManually, 0, gCountAvailable);
272
273  for(i = 0 ; i < num_encodings ; i++) {
274    UErrorCode status = U_ZERO_ERROR;
275    /* get unicode set for that converter */
276    USet* set;
277    UConverter* test_converter;
278    UChar32 cp;
279    int32_t encIndex, offset;
280
281    set = uset_openEmpty();
282    test_converter = ucnv_open(encodings[i], &status);
283    ucnv_getUnicodeSet(test_converter, set,
284                       whichSet, &status);
285    if (excludedCodePoints != NULL) {
286      uset_addAll(set, excludedCodePoints);
287    }
288    uset_freeze(set);
289    offset = 0;
290    cp = 0;
291
292    encIndex = findIndex(encodings[i]);
293    /*
294     * The following is almost, but not entirely, the same as
295     * resultsManually[encIndex] =
296     *   (UBool)(uset_spanUTF8(set, utf8, length, USET_SPAN_SIMPLE) == length);
297     * They might be different if the set contains strings,
298     * or if the utf8 string contains an illegal sequence.
299     *
300     * The UConverterSelector does not currently handle strings that can be
301     * converted, and it treats an illegal sequence as convertible
302     * while uset_spanUTF8() treats it like U+FFFD which may not be convertible.
303     */
304    resultsManually[encIndex] = TRUE;
305    while(offset<length) {
306      U8_NEXT(utf8, offset, length, cp);
307      if (cp >= 0 && !uset_contains(set, cp)) {
308        resultsManually[encIndex] = FALSE;
309        break;
310      }
311    }
312    uset_close(set);
313    ucnv_close(test_converter);
314  }
315  return resultsManually;
316}
317
318/* closes res but does not free resultsManually */
319static void verifyResult(UEnumeration* res, const UBool *resultsManually) {
320  UBool* resultsFromSystem = (UBool*) uprv_malloc(gCountAvailable * sizeof(UBool));
321  const char* name;
322  UErrorCode status = U_ZERO_ERROR;
323  int32_t i;
324
325  /* fill the bool for the selector results! */
326  uprv_memset(resultsFromSystem, 0, gCountAvailable);
327  while ((name = uenum_next(res,NULL, &status)) != NULL) {
328    resultsFromSystem[findIndex(name)] = TRUE;
329  }
330  for(i = 0 ; i < gCountAvailable; i++) {
331    if(resultsManually[i] != resultsFromSystem[i]) {
332      log_err("failure in converter selector\n"
333              "converter %s had conflicting results -- manual: %d, system %d\n",
334              gAvailableNames[i], resultsManually[i], resultsFromSystem[i]);
335    }
336  }
337  uprv_free(resultsFromSystem);
338  uenum_close(res);
339}
340
341static UConverterSelector *
342serializeAndUnserialize(UConverterSelector *sel, char **buffer, UErrorCode *status) {
343  char *new_buffer;
344  int32_t ser_len, ser_len2;
345  /* preflight */
346  ser_len = ucnvsel_serialize(sel, NULL, 0, status);
347  if (*status != U_BUFFER_OVERFLOW_ERROR) {
348    log_err("ucnvsel_serialize(preflighting) failed: %s\n", u_errorName(*status));
349    return sel;
350  }
351  new_buffer = (char *)uprv_malloc(ser_len);
352  *status = U_ZERO_ERROR;
353  ser_len2 = ucnvsel_serialize(sel, new_buffer, ser_len, status);
354  if (U_FAILURE(*status) || ser_len != ser_len2) {
355    log_err("ucnvsel_serialize() failed: %s\n", u_errorName(*status));
356    uprv_free(new_buffer);
357    return sel;
358  }
359  ucnvsel_close(sel);
360  uprv_free(*buffer);
361  *buffer = new_buffer;
362  sel = ucnvsel_openFromSerialized(new_buffer, ser_len, status);
363  if (U_FAILURE(*status)) {
364    log_err("ucnvsel_openFromSerialized() failed: %s\n", u_errorName(*status));
365    return NULL;
366  }
367  return sel;
368}
369
370static void TestSelector()
371{
372  TestText text;
373  USet* excluded_sets[3] = { NULL };
374  int32_t i, testCaseIdx;
375
376  if (!getAvailableNames()) {
377    return;
378  }
379  if (!text_open(&text)) {
380    releaseAvailableNames();;
381  }
382
383  excluded_sets[0] = uset_openEmpty();
384  for(i = 1 ; i < 3 ; i++) {
385    excluded_sets[i] = uset_open(i*30, i*30+500);
386  }
387
388  for(testCaseIdx = 0; testCaseIdx < LENGTHOF(getEncodingsFns); testCaseIdx++)
389  {
390    int32_t excluded_set_id;
391    int32_t num_encodings;
392    const char **encodings = getEncodingsFns[testCaseIdx](&num_encodings);
393    if (QUICK && num_encodings > 25) {
394      uprv_free((void *)encodings);
395      continue;
396    }
397
398    /*
399     * for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
400     *
401     * This loop was replaced by the following statement because
402     * the loop made the test run longer without adding to the code coverage.
403     * The handling of the exclusion set is independent of the
404     * set of encodings, so there is no need to test every combination.
405     */
406    excluded_set_id = testCaseIdx % LENGTHOF(excluded_sets);
407    {
408      UConverterSelector *sel_rt, *sel_fb;
409      char *buffer_fb = NULL;
410      UErrorCode status = U_ZERO_ERROR;
411      sel_rt = ucnvsel_open(encodings, num_encodings,
412                            excluded_sets[excluded_set_id],
413                            UCNV_ROUNDTRIP_SET, &status);
414      if (num_encodings == gCountAvailable) {
415        /* test the special "all converters" parameter values */
416        sel_fb = ucnvsel_open(NULL, 0,
417                              excluded_sets[excluded_set_id],
418                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
419      } else if (uset_isEmpty(excluded_sets[excluded_set_id])) {
420        /* test that a NULL set gives the same results as an empty set */
421        sel_fb = ucnvsel_open(encodings, num_encodings,
422                              NULL,
423                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
424      } else {
425        sel_fb = ucnvsel_open(encodings, num_encodings,
426                              excluded_sets[excluded_set_id],
427                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
428      }
429      if (U_FAILURE(status)) {
430        log_err("ucnv_sel_open(encodings %ld) failed - %s\n", testCaseIdx, u_errorName(status));
431        ucnvsel_close(sel_rt);
432        uprv_free((void *)encodings);
433        continue;
434      }
435
436      text_reset(&text);
437      for (;;) {
438        UBool *manual_rt, *manual_fb;
439        static UChar utf16[10000];
440        char *s;
441        int32_t length8, length16;
442
443        s = text_nextString(&text, &length8);
444        if (s == NULL || (QUICK && text.number > 3)) {
445          break;
446        }
447
448        manual_rt = getResultsManually(encodings, num_encodings,
449                                       s, length8,
450                                       excluded_sets[excluded_set_id],
451                                       UCNV_ROUNDTRIP_SET);
452        manual_fb = getResultsManually(encodings, num_encodings,
453                                       s, length8,
454                                       excluded_sets[excluded_set_id],
455                                       UCNV_ROUNDTRIP_AND_FALLBACK_SET);
456        /* UTF-8 with length */
457        status = U_ZERO_ERROR;
458        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, length8, &status), manual_rt);
459        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, length8, &status), manual_fb);
460        /* UTF-8 NUL-terminated */
461        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, -1, &status), manual_rt);
462        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, -1, &status), manual_fb);
463
464        u_strFromUTF8(utf16, LENGTHOF(utf16), &length16, s, length8, &status);
465        if (U_FAILURE(status)) {
466          log_err("error converting the test text (string %ld) to UTF-16 - %s\n",
467                  (long)text.number, u_errorName(status));
468        } else {
469          if (text.number == 0) {
470            sel_fb = serializeAndUnserialize(sel_fb, &buffer_fb, &status);
471          }
472          if (U_SUCCESS(status)) {
473            /* UTF-16 with length */
474            verifyResult(ucnvsel_selectForString(sel_rt, utf16, length16, &status), manual_rt);
475            verifyResult(ucnvsel_selectForString(sel_fb, utf16, length16, &status), manual_fb);
476            /* UTF-16 NUL-terminated */
477            verifyResult(ucnvsel_selectForString(sel_rt, utf16, -1, &status), manual_rt);
478            verifyResult(ucnvsel_selectForString(sel_fb, utf16, -1, &status), manual_fb);
479          }
480        }
481
482        uprv_free(manual_rt);
483        uprv_free(manual_fb);
484      }
485      ucnvsel_close(sel_rt);
486      ucnvsel_close(sel_fb);
487      uprv_free(buffer_fb);
488    }
489    uprv_free((void *)encodings);
490  }
491
492  releaseAvailableNames();
493  text_close(&text);
494  for(i = 0 ; i < 3 ; i++) {
495    uset_close(excluded_sets[i]);
496  }
497}
498