1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************
8 *
9 * @author Mark E. Davis
10 * @author Vladimir Weinstein
11 */
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_NORMALIZATION
16
17#include "intltest.h"
18#include "cmemory.h"
19#include "cstring.h"
20#include "canittst.h"
21#include "unicode/caniter.h"
22#include "unicode/normlzr.h"
23#include "unicode/uchar.h"
24#include "hash.h"
25
// Expands to one `case` label of the switch in runIndexedTest(): stores the
// test's name in `name` and, only when `exec` is set, logs a banner followed by
// an empty line and then invokes the test method.
// Relies on `name`, `exec` and logln() being visible at the expansion site.
#define CASE(id,test)                       \
    case id:                                \
        name = #test;                       \
        if (exec) {                         \
            logln(#test "---");             \
            logln(UnicodeString());         \
            test();                         \
        }                                   \
        break
34
35void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
36                                         const char* &name, char* /*par*/) {
37    switch (index) {
38        CASE(0, TestBasic);
39        CASE(1, TestExhaustive);
40        CASE(2, TestAPI);
41      default: name = ""; break;
42    }
43}
44
/*
 * Disabled helper, kept for reference only: converts Java-style strings with
 * \u Unicode escapes into UnicodeString objects.
 *
static UnicodeString str(const char *input)
{
    UnicodeString str(input, ""); // Invariant conversion
    return str.unescape();
}
 */
53
54
55CanonicalIteratorTest::CanonicalIteratorTest() :
56nameTrans(NULL), hexTrans(NULL)
57{
58}
59
60CanonicalIteratorTest::~CanonicalIteratorTest()
61{
62#if !UCONFIG_NO_TRANSLITERATION
63  if(nameTrans != NULL) {
64    delete(nameTrans);
65  }
66  if(hexTrans != NULL) {
67    delete(hexTrans);
68  }
69#endif
70}
71
72void CanonicalIteratorTest::TestExhaustive() {
73    UErrorCode status = U_ZERO_ERROR;
74    CanonicalIterator it("", status);
75    if (U_FAILURE(status)) {
76        dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
77        return;
78    }
79    UChar32 i = 0;
80    UnicodeString s;
81    // Test static and dynamic class IDs
82    if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
83        errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
84    }
85    for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
86        //for (i = 0xae00; i < 0xaf00; ++i) {
87
88        if ((i % 0x100) == 0) {
89            logln("Testing U+%06X", i);
90        }
91
92        // skip characters we know don't have decomps
93        int8_t type = u_charType(i);
94        if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR
95            || type == U_SURROGATE) continue;
96
97        s = i;
98        characterTest(s, i, it);
99
100        s += (UChar32)0x0345; //"\\u0345";
101        characterTest(s, i, it);
102    }
103}
104
105void CanonicalIteratorTest::TestBasic() {
106
107    UErrorCode status = U_ZERO_ERROR;
108
109    static const char * const testArray[][2] = {
110        {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
111            "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
112            "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
113            "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
114        {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
115        {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
116    };
117
118#if 0
119    // This is not interesting for C/C++ as the data is already built beforehand
120    // check build
121    UnicodeSet ss = CanonicalIterator.getSafeStart();
122    logln("Safe Start: " + ss.toPattern(true));
123    ss = CanonicalIterator.getStarts('a');
124    expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
125        new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
126        + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
127            );
128#endif
129
130    // check permute
131    // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
132
133    Hashtable *permutations = new Hashtable(FALSE, status);
134    permutations->setValueDeleter(uprv_deleteUObject);
135    UnicodeString toPermute("ABC");
136
137    CanonicalIterator::permute(toPermute, FALSE, permutations, status);
138
139    logln("testing permutation");
140
141    expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");
142
143    delete permutations;
144
145    // try samples
146    logln("testing samples");
147    Hashtable *set = new Hashtable(FALSE, status);
148    set->setValueDeleter(uprv_deleteUObject);
149    int32_t i = 0;
150    CanonicalIterator it("", status);
151    if(U_SUCCESS(status)) {
152      for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) {
153          //logln("Results for: " + name.transliterate(testArray[i]));
154          UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
155          it.setSource(testStr, status);
156          set->removeAll();
157          for (;;) {
158              //UnicodeString *result = new UnicodeString(it.next());
159              UnicodeString result(it.next());
160              if (result.isBogus()) {
161                  break;
162              }
163              set->put(result, new UnicodeString(result), status); // Add result to the table
164              //logln(++counter + ": " + hex.transliterate(result));
165              //logln(" = " + name.transliterate(result));
166          }
167          expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));
168
169      }
170    } else {
171      dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
172    }
173    delete set;
174}
175
176void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it)
177{
178    UErrorCode status = U_ZERO_ERROR;
179    UnicodeString decomp, comp;
180    UBool gotDecomp = FALSE;
181    UBool gotComp = FALSE;
182    UBool gotSource = FALSE;
183
184    Normalizer::decompose(s, FALSE, 0, decomp, status);
185    Normalizer::compose(s, FALSE, 0, comp, status);
186
187    // skip characters that don't have either decomp.
188    // need quick test for this!
189    if (s == decomp && s == comp) {
190        return;
191    }
192
193    it.setSource(s, status);
194
195    for (;;) {
196        UnicodeString item = it.next();
197        if (item.isBogus()) break;
198        if (item == s) gotSource = TRUE;
199        if (item == decomp) gotDecomp = TRUE;
200        if (item == comp) gotComp = TRUE;
201    }
202
203    if (!gotSource || !gotDecomp || !gotComp) {
204        errln("FAIL CanonicalIterator: " + s + (int)ch);
205    }
206}
207
208void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
209    if (!(a==b)) {
210        errln("FAIL: " + message + getReadable(item));
211        errln("\t" + getReadable(a));
212        errln("\t" + getReadable(b));
213    } else {
214        logln("Checked: " + message + getReadable(item));
215        logln("\t" + getReadable(a));
216        logln("\t" + getReadable(b));
217    }
218}
219
220UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
221  UErrorCode status = U_ZERO_ERROR;
222  UnicodeString result = "[";
223    if (s.length() == 0) return "";
224    // set up for readable display
225#if !UCONFIG_NO_TRANSLITERATION
226    if(verbose) {
227      if (nameTrans == NULL)
228          nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status);
229      UnicodeString sName = s;
230      nameTrans->transliterate(sName);
231      result += sName;
232      result += ";";
233    }
234    if (hexTrans == NULL)
235        hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status);
236#endif
237    UnicodeString sHex = s;
238#if !UCONFIG_NO_TRANSLITERATION
239    if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated
240      hexTrans->transliterate(sHex);
241    }
242#endif
243    result += sHex;
244    result += "]";
245    return result;
246    //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]";
247}
248
249U_CFUNC int U_CALLCONV
250compareUnicodeStrings(const void *s1, const void *s2) {
251  UnicodeString **st1 = (UnicodeString **)s1;
252  UnicodeString **st2 = (UnicodeString **)s2;
253
254  return (*st1)->compare(**st2);
255}
256
257
258UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
259    UnicodeString result;
260
261    // Iterate over the Hashtable, then qsort.
262
263    UnicodeString **resArray = new UnicodeString*[col->count()];
264    int32_t i = 0;
265
266    const UHashElement *ne = NULL;
267    int32_t el = UHASH_FIRST;
268    //Iterator it = basic.iterator();
269    ne = col->nextElement(el);
270    //while (it.hasNext())
271    while (ne != NULL) {
272      //String item = (String) it.next();
273      UnicodeString *item = (UnicodeString *)(ne->value.pointer);
274      resArray[i++] = item;
275      ne = col->nextElement(el);
276    }
277
278    for(i = 0; i<col->count(); ++i) {
279      logln(*resArray[i]);
280    }
281
282    qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings);
283
284    result = *resArray[0];
285
286    for(i = 1; i<col->count(); ++i) {
287      result += ", ";
288      result += *resArray[i];
289    }
290
291/*
292    Iterator it = col.iterator();
293    while (it.hasNext()) {
294        if (result.length() != 0) result.append(", ");
295        result.append(it.next().toString());
296    }
297*/
298
299    delete [] resArray;
300
301    return result;
302}
303
304void CanonicalIteratorTest::TestAPI() {
305  UErrorCode status = U_ZERO_ERROR;
306  // Test reset and getSource
307  UnicodeString start("ljubav");
308  logln("Testing CanonicalIterator::getSource");
309  logln("Instantiating canonical iterator with string "+start);
310  CanonicalIterator can(start, status);
311  if (U_FAILURE(status)) {
312      dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
313      return;
314  }
315  UnicodeString source = can.getSource();
316  logln("CanonicalIterator::getSource returned "+source);
317  if(start != source) {
318    errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
319  }
320  logln("Testing CanonicalIterator::reset");
321  UnicodeString next = can.next();
322  logln("CanonicalIterator::next returned "+next);
323
324  can.reset();
325
326  UnicodeString afterReset = can.next();
327  logln("After reset, CanonicalIterator::next returned "+afterReset);
328
329  if(next != afterReset) {
330    errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
331  }
332
333  logln("Testing getStaticClassID and getDynamicClassID");
334  if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
335      errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
336  }
337}
338
339#endif /* #if !UCONFIG_NO_NORMALIZATION */
340