ucnvseltst.c revision 85bf2e2fbc60a9f938064abc8127d61da7d19882
1/********************************************************************
2 * Copyright (c) 1997-2009, International Business Machines
3 * Corporation and others. All Rights Reserved.
4 ********************************************************************
5 *
6 * File UCNVSELTST.C
7 *
8 * Modification History:
9 *        Name                     Description
10 *     MOHAMED ELDAWY               Creation
11 ********************************************************************
12 */
13
14/* C API AND FUNCTIONALITY TEST FOR CONVERTER SELECTOR (ucnvsel.h)*/
15
16#include "ucnvseltst.h"
17
18#include <stdio.h>
19
20#include "unicode/utypes.h"
21#include "unicode/ucnvsel.h"
22#include "unicode/ustring.h"
23#include "cmemory.h"
24#include "cstring.h"
25
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro always behaves
 * as a single operand, whatever operator is applied to it.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* Size in bytes of the path buffer used by fopenOrError(). */
#define FILENAME_BUFFER 1024

/* Path fragment appended to ctest_dataSrcDir() to reach the test data files. */
#define TDSRCPATH  ".." U_FILE_SEP_STRING "test" U_FILE_SEP_STRING "testdata" U_FILE_SEP_STRING
31
32static void TestSelector(void);
33
34void addCnvSelTest(TestNode** root)
35{
36    addTest(root, &TestSelector, "tsconv/ucnvseltst/TestSelector");
37}
38
/* Cached converter names filled in by getAvailableNames(); NULL while not loaded. */
static const char **gAvailableNames = NULL;
/* Number of entries in gAvailableNames; 0 while not loaded. */
static int32_t gCountAvailable = 0;
41
42static UBool
43getAvailableNames() {
44  int32_t i;
45  if (gAvailableNames != NULL) {
46    return TRUE;
47  }
48  gCountAvailable = ucnv_countAvailable();
49  if (gCountAvailable == 0) {
50    log_data_err("No converters available.\n");
51    return FALSE;
52  }
53  gAvailableNames = (const char **)uprv_malloc(gCountAvailable * sizeof(const char *));
54  if (gAvailableNames == NULL) {
55    log_err("unable to allocate memory for %ld available converter names\n",
56            (long)gCountAvailable);
57    return FALSE;
58  }
59  for (i = 0; i < gCountAvailable; ++i) {
60    gAvailableNames[i] = ucnv_getAvailableName(i);
61  }
62  return TRUE;
63}
64
65static void
66releaseAvailableNames() {
67  uprv_free((void *)gAvailableNames);
68  gAvailableNames = NULL;
69  gCountAvailable = 0;
70}
71
72static const char **
73getEncodings(int32_t start, int32_t step, int32_t count, int32_t *pCount) {
74  const char **names;
75  int32_t i;
76
77  *pCount = 0;
78  if (count <= 0) {
79    return NULL;
80  }
81  names = (const char **)uprv_malloc(count * sizeof(char *));
82  if (names == NULL) {
83    log_err("memory allocation error for %ld pointers\n", (long)count);
84    return NULL;
85  }
86  if (step == 0 && count > 0) {
87    step = 1;
88  }
89  for (i = 0; i < count; ++i) {
90    if (0 <= start && start < gCountAvailable) {
91      names[i] = gAvailableNames[start];
92      start += step;
93      ++*pCount;
94    }
95  }
96  return names;
97}
98
99#if 0
100/*
101 * ucnvsel_open() does not support "no encodings":
102 * Given 0 encodings it will open a selector for all available ones.
103 */
104static const char **
105getNoEncodings(int32_t *pCount) {
106  *pCount = 0;
107  return NULL;
108}
109#endif
110
/*
 * Requests a single converter name: the one at index 1 of the available list.
 * The caller frees the returned array.
 */
static const char **
getOneEncoding(int32_t *pCount) {
  const int32_t start = 1, step = 0, count = 1;
  return getEncodings(start, step, count, pCount);
}
115
/*
 * Requests up to 25 converter names at the even indexes 0, 2, 4, ...
 * The caller frees the returned array.
 */
static const char **
getFirstEvenEncodings(int32_t *pCount) {
  const int32_t start = 0, step = 2, count = 25;
  return getEncodings(start, step, count, pCount);
}
120
121static const char **
122getMiddleEncodings(int32_t *pCount) {
123  return getEncodings(gCountAvailable - 12, 1, 22, pCount);
124}
125
126static const char **
127getLastEncodings(int32_t *pCount) {
128  return getEncodings(gCountAvailable - 1, -1, 25, pCount);
129}
130
131static const char **
132getSomeEncodings(int32_t *pCount) {
133  /* 20 evenly distributed */
134  return getEncodings(5, (gCountAvailable + 19)/ 20, 20, pCount);
135}
136
137static const char **
138getEveryThirdEncoding(int32_t *pCount) {
139  return getEncodings(2, 3, (gCountAvailable + 2 )/ 3, pCount);
140}
141
142static const char **
143getAllEncodings(int32_t *pCount) {
144  return getEncodings(0, 1, gCountAvailable, pCount);
145}
146
147typedef const char **GetEncodingsFn(int32_t *);
148
149static GetEncodingsFn *const getEncodingsFns[] = {
150  getOneEncoding,
151  getFirstEvenEncodings,
152  getMiddleEncodings,
153  getLastEncodings,
154  getSomeEncodings,
155  getEveryThirdEncoding,
156  getAllEncodings
157};
158
159static FILE *fopenOrError(const char *filename) {
160    int32_t needLen;
161    FILE *f;
162    char fnbuf[FILENAME_BUFFER];
163    const char* directory= ctest_dataSrcDir();
164    needLen = uprv_strlen(directory)+uprv_strlen(TDSRCPATH)+uprv_strlen(filename)+1;
165    if(needLen > FILENAME_BUFFER) {
166        log_err("FAIL: Could not load %s. Filename buffer overflow, needed %d but buffer is %d\n",
167                filename, needLen, FILENAME_BUFFER);
168        return NULL;
169    }
170
171    strcpy(fnbuf, directory);
172    strcat(fnbuf, TDSRCPATH);
173    strcat(fnbuf, filename);
174
175    f = fopen(fnbuf, "rb");
176
177    if(f == NULL) {
178        log_data_err("FAIL: Could not load %s [%s]\n", fnbuf, filename);
179    }
180    return f;
181}
182
/*
 * The test text loaded into memory, plus an iteration cursor over its
 * NUL-separated strings.
 */
typedef struct TestText {
  char *text;       /* start of the allocated buffer */
  char *textLimit;  /* end of the loaded text; a NUL is stored here */
  char *limit;      /* end of the most recently delivered string */
  int32_t number;   /* zero-based number of the most recently delivered string */
} TestText;
188
189static void
190text_reset(TestText *tt) {
191  tt->limit = tt->text;
192  tt->number = 0;
193}
194
195static char *
196text_nextString(TestText *tt, int32_t *pLength) {
197  char *s = tt->limit;
198  if (s == tt->textLimit) {
199    /* we already delivered the last string */
200    return NULL;
201  } else if (s == tt->text) {
202    /* first string */
203    if ((tt->textLimit - tt->text) >= 3 &&
204        s[0] == (char)0xef && s[1] == (char)0xbb && s[2] == (char)0xbf
205    ) {
206      s += 3;  /* skip the UTF-8 signature byte sequence (U+FEFF) */
207    }
208  } else {
209    /* skip the string terminator */
210    ++s;
211    ++tt->number;
212  }
213
214  /* find the end of this string */
215  tt->limit = uprv_strchr(s, 0);
216  *pLength = (int32_t)(tt->limit - s);
217  return s;
218}
219
220static UBool
221text_open(TestText *tt) {
222  FILE *f;
223  char *s;
224  int32_t length;
225  uprv_memset(tt, 0, sizeof(TestText));
226  f = fopenOrError("ConverterSelectorTestUTF8.txt");
227  if(!f) {
228    return FALSE;
229  }
230  fseek(f, 0, SEEK_END);
231  length = (int32_t)ftell(f);
232  fseek(f, 0, SEEK_SET);
233  tt->text = (char *)uprv_malloc(length + 1);
234  if (tt->text == NULL) {
235    fclose(f);
236    return FALSE;
237  }
238  if (length != fread(tt->text, 1, length, f)) {
239    log_err("error reading %ld bytes from test text file\n", (long)length);
240    length = 0;
241    uprv_free(tt->text);
242  }
243  fclose(f);
244  tt->textLimit = tt->text + length;
245  *tt->textLimit = 0;
246  /* replace all Unicode '#' (U+0023) with NUL */
247  for(s = tt->text; (s = uprv_strchr(s, 0x23)) != NULL; *s++ = 0) {}
248  text_reset(tt);
249  return TRUE;
250}
251
252static void
253text_close(TestText *tt) {
254  uprv_free(tt->text);
255}
256
257static int32_t findIndex(const char* converterName) {
258  int32_t i;
259  for (i = 0 ; i < gCountAvailable; i++) {
260    if(ucnv_compareNames(gAvailableNames[i], converterName) == 0) {
261      return i;
262    }
263  }
264  return -1;
265}
266
267static UBool *
268getResultsManually(const char** encodings, int32_t num_encodings,
269                   const char *utf8, int32_t length,
270                   const USet* excludedCodePoints, const UConverterUnicodeSet whichSet) {
271  UBool* resultsManually;
272  int32_t i;
273
274  resultsManually = (UBool*) uprv_malloc(gCountAvailable);
275  uprv_memset(resultsManually, 0, gCountAvailable);
276
277  for(i = 0 ; i < num_encodings ; i++) {
278    UErrorCode status = U_ZERO_ERROR;
279    /* get unicode set for that converter */
280    USet* set;
281    UConverter* test_converter;
282    UChar32 cp;
283    int32_t encIndex, offset;
284
285    set = uset_openEmpty();
286    test_converter = ucnv_open(encodings[i], &status);
287    ucnv_getUnicodeSet(test_converter, set,
288                       whichSet, &status);
289    if (excludedCodePoints != NULL) {
290      uset_addAll(set, excludedCodePoints);
291    }
292    uset_freeze(set);
293    offset = 0;
294    cp = 0;
295
296    encIndex = findIndex(encodings[i]);
297    /*
298     * The following is almost, but not entirely, the same as
299     * resultsManually[encIndex] =
300     *   (UBool)(uset_spanUTF8(set, utf8, length, USET_SPAN_SIMPLE) == length);
301     * They might be different if the set contains strings,
302     * or if the utf8 string contains an illegal sequence.
303     *
304     * The UConverterSelector does not currently handle strings that can be
305     * converted, and it treats an illegal sequence as convertible
306     * while uset_spanUTF8() treats it like U+FFFD which may not be convertible.
307     */
308    resultsManually[encIndex] = TRUE;
309    while(offset<length) {
310      U8_NEXT(utf8, offset, length, cp);
311      if (cp >= 0 && !uset_contains(set, cp)) {
312        resultsManually[encIndex] = FALSE;
313        break;
314      }
315    }
316    uset_close(set);
317    ucnv_close(test_converter);
318  }
319  return resultsManually;
320}
321
322/* closes res but does not free resultsManually */
323static void verifyResult(UEnumeration* res, const UBool *resultsManually) {
324  UBool* resultsFromSystem = (UBool*) uprv_malloc(gCountAvailable * sizeof(UBool));
325  const char* name;
326  UErrorCode status = U_ZERO_ERROR;
327  int32_t i;
328
329  /* fill the bool for the selector results! */
330  uprv_memset(resultsFromSystem, 0, gCountAvailable);
331  while ((name = uenum_next(res,NULL, &status)) != NULL) {
332    resultsFromSystem[findIndex(name)] = TRUE;
333  }
334  for(i = 0 ; i < gCountAvailable; i++) {
335    if(resultsManually[i] != resultsFromSystem[i]) {
336      log_err("failure in converter selector\n"
337              "converter %s had conflicting results -- manual: %d, system %d\n",
338              gAvailableNames[i], resultsManually[i], resultsFromSystem[i]);
339    }
340  }
341  uprv_free(resultsFromSystem);
342  uenum_close(res);
343}
344
345static UConverterSelector *
346serializeAndUnserialize(UConverterSelector *sel, char **buffer, UErrorCode *status) {
347  char *new_buffer;
348  int32_t ser_len, ser_len2;
349  /* preflight */
350  ser_len = ucnvsel_serialize(sel, NULL, 0, status);
351  if (*status != U_BUFFER_OVERFLOW_ERROR) {
352    log_err("ucnvsel_serialize(preflighting) failed: %s\n", u_errorName(*status));
353    return sel;
354  }
355  new_buffer = (char *)uprv_malloc(ser_len);
356  *status = U_ZERO_ERROR;
357  ser_len2 = ucnvsel_serialize(sel, new_buffer, ser_len, status);
358  if (U_FAILURE(*status) || ser_len != ser_len2) {
359    log_err("ucnvsel_serialize() failed: %s\n", u_errorName(*status));
360    uprv_free(new_buffer);
361    return sel;
362  }
363  ucnvsel_close(sel);
364  uprv_free(*buffer);
365  *buffer = new_buffer;
366  sel = ucnvsel_openFromSerialized(new_buffer, ser_len, status);
367  if (U_FAILURE(*status)) {
368    log_err("ucnvsel_openFromSerialized() failed: %s\n", u_errorName(*status));
369    return NULL;
370  }
371  return sel;
372}
373
374static void TestSelector()
375{
376  TestText text;
377  USet* excluded_sets[3] = { NULL };
378  int32_t i, testCaseIdx;
379
380  if (!getAvailableNames()) {
381    return;
382  }
383  if (!text_open(&text)) {
384    releaseAvailableNames();;
385  }
386
387  excluded_sets[0] = uset_openEmpty();
388  for(i = 1 ; i < 3 ; i++) {
389    excluded_sets[i] = uset_open(i*30, i*30+500);
390  }
391
392  for(testCaseIdx = 0; testCaseIdx < LENGTHOF(getEncodingsFns); testCaseIdx++)
393  {
394    int32_t excluded_set_id;
395    int32_t num_encodings;
396    const char **encodings = getEncodingsFns[testCaseIdx](&num_encodings);
397    if (QUICK && num_encodings > 25) {
398      uprv_free((void *)encodings);
399      continue;
400    }
401
402    /*
403     * for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
404     *
405     * This loop was replaced by the following statement because
406     * the loop made the test run longer without adding to the code coverage.
407     * The handling of the exclusion set is independent of the
408     * set of encodings, so there is no need to test every combination.
409     */
410    excluded_set_id = testCaseIdx % LENGTHOF(excluded_sets);
411    {
412      UConverterSelector *sel_rt, *sel_fb;
413      char *buffer_fb = NULL;
414      UErrorCode status = U_ZERO_ERROR;
415      sel_rt = ucnvsel_open(encodings, num_encodings,
416                            excluded_sets[excluded_set_id],
417                            UCNV_ROUNDTRIP_SET, &status);
418      if (num_encodings == gCountAvailable) {
419        /* test the special "all converters" parameter values */
420        sel_fb = ucnvsel_open(NULL, 0,
421                              excluded_sets[excluded_set_id],
422                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
423      } else if (uset_isEmpty(excluded_sets[excluded_set_id])) {
424        /* test that a NULL set gives the same results as an empty set */
425        sel_fb = ucnvsel_open(encodings, num_encodings,
426                              NULL,
427                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
428      } else {
429        sel_fb = ucnvsel_open(encodings, num_encodings,
430                              excluded_sets[excluded_set_id],
431                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
432      }
433      if (U_FAILURE(status)) {
434        log_err("ucnv_sel_open(encodings %ld) failed - %s\n", testCaseIdx, u_errorName(status));
435        ucnvsel_close(sel_rt);
436        uprv_free((void *)encodings);
437        continue;
438      }
439
440      text_reset(&text);
441      for (;;) {
442        UBool *manual_rt, *manual_fb;
443        static UChar utf16[10000];
444        char *s;
445        int32_t length8, length16;
446
447        s = text_nextString(&text, &length8);
448        if (s == NULL || (QUICK && text.number > 3)) {
449          break;
450        }
451
452        manual_rt = getResultsManually(encodings, num_encodings,
453                                       s, length8,
454                                       excluded_sets[excluded_set_id],
455                                       UCNV_ROUNDTRIP_SET);
456        manual_fb = getResultsManually(encodings, num_encodings,
457                                       s, length8,
458                                       excluded_sets[excluded_set_id],
459                                       UCNV_ROUNDTRIP_AND_FALLBACK_SET);
460        /* UTF-8 with length */
461        status = U_ZERO_ERROR;
462        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, length8, &status), manual_rt);
463        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, length8, &status), manual_fb);
464        /* UTF-8 NUL-terminated */
465        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, -1, &status), manual_rt);
466        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, -1, &status), manual_fb);
467
468        u_strFromUTF8(utf16, LENGTHOF(utf16), &length16, s, length8, &status);
469        if (U_FAILURE(status)) {
470          log_err("error converting the test text (string %ld) to UTF-16 - %s\n",
471                  (long)text.number, u_errorName(status));
472        } else {
473          if (text.number == 0) {
474            sel_fb = serializeAndUnserialize(sel_fb, &buffer_fb, &status);
475          }
476          if (U_SUCCESS(status)) {
477            /* UTF-16 with length */
478            verifyResult(ucnvsel_selectForString(sel_rt, utf16, length16, &status), manual_rt);
479            verifyResult(ucnvsel_selectForString(sel_fb, utf16, length16, &status), manual_fb);
480            /* UTF-16 NUL-terminated */
481            verifyResult(ucnvsel_selectForString(sel_rt, utf16, -1, &status), manual_rt);
482            verifyResult(ucnvsel_selectForString(sel_fb, utf16, -1, &status), manual_fb);
483          }
484        }
485
486        uprv_free(manual_rt);
487        uprv_free(manual_fb);
488      }
489      ucnvsel_close(sel_rt);
490      ucnvsel_close(sel_fb);
491      uprv_free(buffer_fb);
492    }
493    uprv_free((void *)encodings);
494  }
495
496  releaseAvailableNames();
497  text_close(&text);
498  for(i = 0 ; i < 3 ; i++) {
499    uset_close(excluded_sets[i]);
500  }
501}
502