1/******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 2002-2006, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************** 6 * 7 * @author Mark E. Davis 8 * @author Vladimir Weinstein 9 */ 10 11#include "unicode/utypes.h" 12 13#if !UCONFIG_NO_NORMALIZATION 14 15#include "intltest.h" 16#include "cstring.h" 17#include "canittst.h" 18#include "unicode/caniter.h" 19#include "unicode/normlzr.h" 20#include "unicode/uchar.h" 21#include "hash.h" 22 23#define ARRAY_LENGTH(array) ((int32_t)(sizeof (array) / sizeof (*array))) 24 25#define CASE(id,test) case id: \ 26 name = #test; \ 27 if (exec) { \ 28 logln(#test "---"); \ 29 logln((UnicodeString)""); \ 30 test(); \ 31 } \ 32 break 33 34void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec, 35 const char* &name, char* /*par*/) { 36 switch (index) { 37 CASE(0, TestBasic); 38 CASE(1, TestExhaustive); 39 CASE(2, TestAPI); 40 default: name = ""; break; 41 } 42} 43 44/** 45 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects 46static UnicodeString str(const char *input) 47{ 48 UnicodeString str(input, ""); // Invariant conversion 49 return str.unescape(); 50} 51 */ 52 53 54CanonicalIteratorTest::CanonicalIteratorTest() : 55nameTrans(NULL), hexTrans(NULL) 56{ 57} 58 59CanonicalIteratorTest::~CanonicalIteratorTest() 60{ 61#if !UCONFIG_NO_TRANSLITERATION 62 if(nameTrans != NULL) { 63 delete(nameTrans); 64 } 65 if(hexTrans != NULL) { 66 delete(hexTrans); 67 } 68#endif 69} 70 71void CanonicalIteratorTest::TestExhaustive() { 72 UErrorCode status = U_ZERO_ERROR; 73 CanonicalIterator it("", status); 74 UChar32 i = 0; 75 UnicodeString s; 76 // Test static and dynamic class IDs 77 if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ 78 errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID"); 79 } 80 for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) { 81 //for (i = 0xae00; i < 0xaf00; ++i) { 82 83 if ((i % 0x100) == 0) { 84 logln("Testing U+%06X", i); 85 } 86 87 // skip characters we know don't have decomps 88 int8_t type = u_charType(i); 89 if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR 90 || type == U_SURROGATE) continue; 91 92 s = i; 93 characterTest(s, i, it); 94 95 s += (UChar32)0x0345; //"\\u0345"; 96 characterTest(s, i, it); 97 } 98} 99 100void CanonicalIteratorTest::TestBasic() { 101 102 UErrorCode status = U_ZERO_ERROR; 103 104 static const char * const testArray[][2] = { 105 {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, " 106 "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, " 107 "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, " 108 "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"}, 109 {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"}, 110 {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"}, 111 }; 112 113#if 0 114 // This is not interesting for C/C++ as the data is already built beforehand 115 // check build 116 UnicodeSet ss = CanonicalIterator.getSafeStart(); 117 logln("Safe Start: " + ss.toPattern(true)); 118 ss = CanonicalIterator.getStarts('a'); 119 expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'), 120 new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB" 121 + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]") 122 ); 123#endif 124 125 // check permute 126 // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted! 127 128 Hashtable *permutations = new Hashtable(FALSE, status); 129 permutations->setValueDeleter(uhash_deleteUnicodeString); 130 UnicodeString toPermute("ABC"); 131 132 CanonicalIterator::permute(toPermute, FALSE, permutations, status); 133 134 logln("testing permutation"); 135 136 expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA"); 137 138 delete permutations; 139 140 // try samples 141 logln("testing samples"); 142 Hashtable *set = new Hashtable(FALSE, status); 143 set->setValueDeleter(uhash_deleteUnicodeString); 144 int32_t i = 0; 145 CanonicalIterator it("", status); 146 if(U_SUCCESS(status)) { 147 for (i = 0; i < ARRAY_LENGTH(testArray); ++i) { 148 //logln("Results for: " + name.transliterate(testArray[i])); 149 UnicodeString testStr = CharsToUnicodeString(testArray[i][0]); 150 it.setSource(testStr, status); 151 set->removeAll(); 152 for (;;) { 153 //UnicodeString *result = new UnicodeString(it.next()); 154 UnicodeString result(it.next()); 155 if (result.isBogus()) { 156 break; 157 } 158 set->put(result, new UnicodeString(result), status); // Add result to the table 159 //logln(++counter + ": " + hex.transliterate(result)); 160 //logln(" = " + name.transliterate(result)); 161 } 162 expectEqual(i + ": ", testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1])); 163 164 } 165 } else { 166 errln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status)); 167 } 168 delete set; 169} 170 171void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it) 172{ 173 UErrorCode status = U_ZERO_ERROR; 174 UnicodeString decomp, comp; 175 UBool gotDecomp = FALSE; 176 UBool gotComp = FALSE; 177 UBool gotSource = FALSE; 178 179 Normalizer::decompose(s, FALSE, 0, decomp, status); 180 Normalizer::compose(s, FALSE, 0, comp, status); 181 182 // skip characters that don't have either decomp. 183 // need quick test for this! 184 if (s == decomp && s == comp) { 185 return; 186 } 187 188 it.setSource(s, status); 189 190 for (;;) { 191 UnicodeString item = it.next(); 192 if (item.isBogus()) break; 193 if (item == s) gotSource = TRUE; 194 if (item == decomp) gotDecomp = TRUE; 195 if (item == comp) gotComp = TRUE; 196 } 197 198 if (!gotSource || !gotDecomp || !gotComp) { 199 errln("FAIL CanonicalIterator: " + s + (int)ch); 200 } 201} 202 203void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) { 204 if (!(a==b)) { 205 errln("FAIL: " + message + getReadable(item)); 206 errln("\t" + getReadable(a)); 207 errln("\t" + getReadable(b)); 208 } else { 209 logln("Checked: " + message + getReadable(item)); 210 logln("\t" + getReadable(a)); 211 logln("\t" + getReadable(b)); 212 } 213} 214 215UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) { 216 UErrorCode status = U_ZERO_ERROR; 217 UnicodeString result = "["; 218 if (s.length() == 0) return ""; 219 // set up for readable display 220#if !UCONFIG_NO_TRANSLITERATION 221 if(verbose) { 222 if (nameTrans == NULL) 223 nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status); 224 UnicodeString sName = s; 225 nameTrans->transliterate(sName); 226 result += sName; 227 result += ";"; 228 } 229 if (hexTrans == NULL) 230 hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status); 231#endif 232 UnicodeString sHex = s; 233#if !UCONFIG_NO_TRANSLITERATION 234 if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated 235 hexTrans->transliterate(sHex); 236 } 237#endif 238 result += sHex; 239 result += "]"; 240 return result; 241 //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]"; 242} 243 244U_CFUNC int U_CALLCONV 245compareUnicodeStrings(const void *s1, const void *s2) { 246 UnicodeString **st1 = (UnicodeString **)s1; 247 UnicodeString **st2 = (UnicodeString **)s2; 248 249 return (*st1)->compare(**st2); 250} 251 252 253UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) { 254 UnicodeString result; 255 256 // Iterate over the Hashtable, then qsort. 257 258 UnicodeString **resArray = new UnicodeString*[col->count()]; 259 int32_t i = 0; 260 261 const UHashElement *ne = NULL; 262 int32_t el = -1; 263 //Iterator it = basic.iterator(); 264 ne = col->nextElement(el); 265 //while (it.hasNext()) 266 while (ne != NULL) { 267 //String item = (String) it.next(); 268 UnicodeString *item = (UnicodeString *)(ne->value.pointer); 269 resArray[i++] = item; 270 ne = col->nextElement(el); 271 } 272 273 for(i = 0; i<col->count(); ++i) { 274 logln(*resArray[i]); 275 } 276 277 qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings); 278 279 result = *resArray[0]; 280 281 for(i = 1; i<col->count(); ++i) { 282 result += ", "; 283 result += *resArray[i]; 284 } 285 286/* 287 Iterator it = col.iterator(); 288 while (it.hasNext()) { 289 if (result.length() != 0) result.append(", "); 290 result.append(it.next().toString()); 291 } 292*/ 293 294 delete [] resArray; 295 296 return result; 297} 298 299void CanonicalIteratorTest::TestAPI() { 300 UErrorCode status = U_ZERO_ERROR; 301 // Test reset and getSource 302 UnicodeString start("ljubav"); 303 logln("Testing CanonicalIterator::getSource"); 304 logln("Instantiating canonical iterator with string "+start); 305 CanonicalIterator can(start, status); 306 UnicodeString source = can.getSource(); 307 logln("CanonicalIterator::getSource returned "+source); 308 if(start != source) { 309 errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source); 310 } 311 logln("Testing CanonicalIterator::reset"); 312 UnicodeString next = can.next(); 313 logln("CanonicalIterator::next returned "+next); 314 315 can.reset(); 316 317 UnicodeString afterReset = can.next(); 318 logln("After reset, CanonicalIterator::next returned "+afterReset); 319 320 if(next != afterReset) { 321 errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+")."); 322 } 323 324 logln("Testing getStaticClassID and getDynamicClassID"); 325 if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ 326 errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID"); 327 } 328} 329 330#endif /* #if !UCONFIG_NO_NORMALIZATION */ 331