1/* 2 ***************************************************************************** 3 * Copyright (C) 1996-2011, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ***************************************************************************** 6 */ 7 8#include "unicode/utypes.h" 9 10#if !UCONFIG_NO_NORMALIZATION 11 12#include "unicode/caniter.h" 13#include "unicode/normalizer2.h" 14#include "unicode/uchar.h" 15#include "unicode/uniset.h" 16#include "unicode/usetiter.h" 17#include "unicode/ustring.h" 18#include "unicode/utf16.h" 19#include "cmemory.h" 20#include "hash.h" 21#include "normalizer2impl.h" 22 23/** 24 * This class allows one to iterate through all the strings that are canonically equivalent to a given 25 * string. For example, here are some sample results: 26Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 271: \u0041\u030A\u0064\u0307\u0327 28 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 292: \u0041\u030A\u0064\u0327\u0307 30 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 313: \u0041\u030A\u1E0B\u0327 32 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 334: \u0041\u030A\u1E11\u0307 34 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 355: \u00C5\u0064\u0307\u0327 36 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 376: \u00C5\u0064\u0327\u0307 38 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 397: \u00C5\u1E0B\u0327 40 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 418: \u00C5\u1E11\u0307 42 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 439: \u212B\u0064\u0307\u0327 44 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 4510: \u212B\u0064\u0327\u0307 46 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 4711: \u212B\u1E0B\u0327 48 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 4912: \u212B\u1E11\u0307 50 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 51 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 52 * since it has not been optimized for that situation. 53 *@author M. Davis 54 *@draft 55 */ 56 57// public 58 59U_NAMESPACE_BEGIN 60 61// TODO: add boilerplate methods. 62 63UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator) 64 65/** 66 *@param source string to get results for 67 */ 68CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) : 69 pieces(NULL), 70 pieces_length(0), 71 pieces_lengths(NULL), 72 current(NULL), 73 current_length(0), 74 nfd(*Normalizer2Factory::getNFDInstance(status)), 75 nfcImpl(*Normalizer2Factory::getNFCImpl(status)) 76{ 77 if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) { 78 setSource(sourceStr, status); 79 } 80} 81 82CanonicalIterator::~CanonicalIterator() { 83 cleanPieces(); 84} 85 86void CanonicalIterator::cleanPieces() { 87 int32_t i = 0; 88 if(pieces != NULL) { 89 for(i = 0; i < pieces_length; i++) { 90 if(pieces[i] != NULL) { 91 delete[] pieces[i]; 92 } 93 } 94 uprv_free(pieces); 95 pieces = NULL; 96 pieces_length = 0; 97 } 98 if(pieces_lengths != NULL) { 99 uprv_free(pieces_lengths); 100 pieces_lengths = NULL; 101 } 102 if(current != NULL) { 103 uprv_free(current); 104 current = NULL; 105 current_length = 0; 106 } 107} 108 109/** 110 *@return gets the source: NOTE: it is the NFD form of source 111 */ 112UnicodeString CanonicalIterator::getSource() { 113 return source; 114} 115 116/** 117 * Resets the iterator so that one can start again from the beginning. 118 */ 119void CanonicalIterator::reset() { 120 done = FALSE; 121 for (int i = 0; i < current_length; ++i) { 122 current[i] = 0; 123 } 124} 125 126/** 127 *@return the next string that is canonically equivalent. The value null is returned when 128 * the iteration is done. 129 */ 130UnicodeString CanonicalIterator::next() { 131 int32_t i = 0; 132 133 if (done) { 134 buffer.setToBogus(); 135 return buffer; 136 } 137 138 // delete old contents 139 buffer.remove(); 140 141 // construct return value 142 143 for (i = 0; i < pieces_length; ++i) { 144 buffer.append(pieces[i][current[i]]); 145 } 146 //String result = buffer.toString(); // not needed 147 148 // find next value for next time 149 150 for (i = current_length - 1; ; --i) { 151 if (i < 0) { 152 done = TRUE; 153 break; 154 } 155 current[i]++; 156 if (current[i] < pieces_lengths[i]) break; // got sequence 157 current[i] = 0; 158 } 159 return buffer; 160} 161 162/** 163 *@param set the source string to iterate against. This allows the same iterator to be used 164 * while changing the source string, saving object creation. 165 */ 166void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { 167 int32_t list_length = 0; 168 UChar32 cp = 0; 169 int32_t start = 0; 170 int32_t i = 0; 171 UnicodeString *list = NULL; 172 173 nfd.normalize(newSource, source, status); 174 if(U_FAILURE(status)) { 175 return; 176 } 177 done = FALSE; 178 179 cleanPieces(); 180 181 // catch degenerate case 182 if (newSource.length() == 0) { 183 pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *)); 184 pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 185 pieces_length = 1; 186 current = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 187 current_length = 1; 188 if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 189 status = U_MEMORY_ALLOCATION_ERROR; 190 goto CleanPartialInitialization; 191 } 192 current[0] = 0; 193 pieces[0] = new UnicodeString[1]; 194 pieces_lengths[0] = 1; 195 if (pieces[0] == 0) { 196 status = U_MEMORY_ALLOCATION_ERROR; 197 goto CleanPartialInitialization; 198 } 199 return; 200 } 201 202 203 list = new UnicodeString[source.length()]; 204 if (list == 0) { 205 status = U_MEMORY_ALLOCATION_ERROR; 206 goto CleanPartialInitialization; 207 } 208 209 // i should initialy be the number of code units at the 210 // start of the string 211 i = U16_LENGTH(source.char32At(0)); 212 //int32_t i = 1; 213 // find the segments 214 // This code iterates through the source string and 215 // extracts segments that end up on a codepoint that 216 // doesn't start any decompositions. (Analysis is done 217 // on the NFD form - see above). 218 for (; i < source.length(); i += U16_LENGTH(cp)) { 219 cp = source.char32At(i); 220 if (nfcImpl.isCanonSegmentStarter(cp)) { 221 source.extract(start, i-start, list[list_length++]); // add up to i 222 start = i; 223 } 224 } 225 source.extract(start, i-start, list[list_length++]); // add last one 226 227 228 // allocate the arrays, and find the strings that are CE to each segment 229 pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *)); 230 pieces_length = list_length; 231 pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 232 current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 233 current_length = list_length; 234 if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 235 status = U_MEMORY_ALLOCATION_ERROR; 236 goto CleanPartialInitialization; 237 } 238 239 for (i = 0; i < current_length; i++) { 240 current[i] = 0; 241 } 242 // for each segment, get all the combinations that can produce 243 // it after NFD normalization 244 for (i = 0; i < pieces_length; ++i) { 245 //if (PROGRESS) printf("SEGMENT\n"); 246 pieces[i] = getEquivalents(list[i], pieces_lengths[i], status); 247 } 248 249 delete[] list; 250 return; 251// Common section to cleanup all local variables and reset object variables. 252CleanPartialInitialization: 253 if (list != NULL) { 254 delete[] list; 255 } 256 cleanPieces(); 257} 258 259/** 260 * Dumb recursive implementation of permutation. 261 * TODO: optimize 262 * @param source the string to find permutations for 263 * @return the results in a set. 264 */ 265void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) { 266 if(U_FAILURE(status)) { 267 return; 268 } 269 //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source))); 270 int32_t i = 0; 271 272 // optimization: 273 // if zero or one character, just return a set with it 274 // we check for length < 2 to keep from counting code points all the time 275 if (source.length() <= 2 && source.countChar32() <= 1) { 276 UnicodeString *toPut = new UnicodeString(source); 277 /* test for NULL */ 278 if (toPut == 0) { 279 status = U_MEMORY_ALLOCATION_ERROR; 280 return; 281 } 282 result->put(source, toPut, status); 283 return; 284 } 285 286 // otherwise iterate through the string, and recursively permute all the other characters 287 UChar32 cp; 288 Hashtable subpermute(status); 289 if(U_FAILURE(status)) { 290 return; 291 } 292 subpermute.setValueDeleter(uprv_deleteUObject); 293 294 for (i = 0; i < source.length(); i += U16_LENGTH(cp)) { 295 cp = source.char32At(i); 296 const UHashElement *ne = NULL; 297 int32_t el = -1; 298 UnicodeString subPermuteString = source; 299 300 // optimization: 301 // if the character is canonical combining class zero, 302 // don't permute it 303 if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) { 304 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 305 continue; 306 } 307 308 subpermute.removeAll(); 309 310 // see what the permutations of the characters before and after this one are 311 //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp))); 312 permute(subPermuteString.replace(i, U16_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status); 313 /* Test for buffer overflows */ 314 if(U_FAILURE(status)) { 315 return; 316 } 317 // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents 318 // of source at this point. 319 320 // prefix this character to all of them 321 ne = subpermute.nextElement(el); 322 while (ne != NULL) { 323 UnicodeString *permRes = (UnicodeString *)(ne->value.pointer); 324 UnicodeString *chStr = new UnicodeString(cp); 325 //test for NULL 326 if (chStr == NULL) { 327 status = U_MEMORY_ALLOCATION_ERROR; 328 return; 329 } 330 chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer)); 331 //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr)); 332 result->put(*chStr, chStr, status); 333 ne = subpermute.nextElement(el); 334 } 335 } 336 //return result; 337} 338 339// privates 340 341// we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 342UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) { 343 Hashtable result(status); 344 Hashtable permutations(status); 345 Hashtable basic(status); 346 if (U_FAILURE(status)) { 347 return 0; 348 } 349 result.setValueDeleter(uprv_deleteUObject); 350 permutations.setValueDeleter(uprv_deleteUObject); 351 basic.setValueDeleter(uprv_deleteUObject); 352 353 UChar USeg[256]; 354 int32_t segLen = segment.extract(USeg, 256, status); 355 getEquivalents2(&basic, USeg, segLen, status); 356 357 // now get all the permutations 358 // add only the ones that are canonically equivalent 359 // TODO: optimize by not permuting any class zero. 360 361 const UHashElement *ne = NULL; 362 int32_t el = -1; 363 //Iterator it = basic.iterator(); 364 ne = basic.nextElement(el); 365 //while (it.hasNext()) 366 while (ne != NULL) { 367 //String item = (String) it.next(); 368 UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 369 370 permutations.removeAll(); 371 permute(item, CANITER_SKIP_ZEROES, &permutations, status); 372 const UHashElement *ne2 = NULL; 373 int32_t el2 = -1; 374 //Iterator it2 = permutations.iterator(); 375 ne2 = permutations.nextElement(el2); 376 //while (it2.hasNext()) 377 while (ne2 != NULL) { 378 //String possible = (String) it2.next(); 379 //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer))); 380 UnicodeString possible(*((UnicodeString *)(ne2->value.pointer))); 381 UnicodeString attempt; 382 nfd.normalize(possible, attempt, status); 383 384 // TODO: check if operator == is semanticaly the same as attempt.equals(segment) 385 if (attempt==segment) { 386 //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible))); 387 // TODO: use the hashtable just to catch duplicates - store strings directly (somehow). 388 result.put(possible, new UnicodeString(possible), status); //add(possible); 389 } else { 390 //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible))); 391 } 392 393 ne2 = permutations.nextElement(el2); 394 } 395 ne = basic.nextElement(el); 396 } 397 398 /* Test for buffer overflows */ 399 if(U_FAILURE(status)) { 400 return 0; 401 } 402 // convert into a String[] to clean up storage 403 //String[] finalResult = new String[result.size()]; 404 UnicodeString *finalResult = NULL; 405 int32_t resultCount; 406 if((resultCount = result.count())) { 407 finalResult = new UnicodeString[resultCount]; 408 if (finalResult == 0) { 409 status = U_MEMORY_ALLOCATION_ERROR; 410 return NULL; 411 } 412 } 413 else { 414 status = U_ILLEGAL_ARGUMENT_ERROR; 415 return NULL; 416 } 417 //result.toArray(finalResult); 418 result_len = 0; 419 el = -1; 420 ne = result.nextElement(el); 421 while(ne != NULL) { 422 finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer)); 423 ne = result.nextElement(el); 424 } 425 426 427 return finalResult; 428} 429 430Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) { 431 432 if (U_FAILURE(status)) { 433 return NULL; 434 } 435 436 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment))); 437 438 UnicodeString toPut(segment, segLen); 439 440 fillinResult->put(toPut, new UnicodeString(toPut), status); 441 442 UnicodeSet starts; 443 444 // cycle through all the characters 445 UChar32 cp; 446 for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) { 447 // see if any character is at the start of some decomposition 448 U16_GET(segment, 0, i, segLen, cp); 449 if (!nfcImpl.getCanonStartSet(cp, starts)) { 450 continue; 451 } 452 // if so, see which decompositions match 453 UnicodeSetIterator iter(starts); 454 while (iter.next()) { 455 UChar32 cp2 = iter.getCodepoint(); 456 Hashtable remainder(status); 457 remainder.setValueDeleter(uprv_deleteUObject); 458 if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) { 459 continue; 460 } 461 462 // there were some matches, so add all the possibilities to the set. 463 UnicodeString prefix(segment, i); 464 prefix += cp2; 465 466 int32_t el = -1; 467 const UHashElement *ne = remainder.nextElement(el); 468 while (ne != NULL) { 469 UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 470 UnicodeString *toAdd = new UnicodeString(prefix); 471 /* test for NULL */ 472 if (toAdd == 0) { 473 status = U_MEMORY_ALLOCATION_ERROR; 474 return NULL; 475 } 476 *toAdd += item; 477 fillinResult->put(*toAdd, toAdd, status); 478 479 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd))); 480 481 ne = remainder.nextElement(el); 482 } 483 } 484 } 485 486 /* Test for buffer overflows */ 487 if(U_FAILURE(status)) { 488 return NULL; 489 } 490 return fillinResult; 491} 492 493/** 494 * See if the decomposition of cp2 is at segment starting at segmentPos 495 * (with canonical rearrangment!) 496 * If so, take the remainder, and return the equivalents 497 */ 498Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 499//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 500 //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp)))); 501 //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos); 502 503 if (U_FAILURE(status)) { 504 return NULL; 505 } 506 507 UnicodeString temp(comp); 508 int32_t inputLen=temp.length(); 509 UnicodeString decompString; 510 nfd.normalize(temp, decompString, status); 511 const UChar *decomp=decompString.getBuffer(); 512 int32_t decompLen=decompString.length(); 513 514 // See if it matches the start of segment (at segmentPos) 515 UBool ok = FALSE; 516 UChar32 cp; 517 int32_t decompPos = 0; 518 UChar32 decompCp; 519 U16_NEXT(decomp, decompPos, decompLen, decompCp); 520 521 int32_t i = segmentPos; 522 while(i < segLen) { 523 U16_NEXT(segment, i, segLen, cp); 524 525 if (cp == decompCp) { // if equal, eat another cp from decomp 526 527 //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp)))); 528 529 if (decompPos == decompLen) { // done, have all decomp characters! 530 temp.append(segment+i, segLen-i); 531 ok = TRUE; 532 break; 533 } 534 U16_NEXT(decomp, decompPos, decompLen, decompCp); 535 } else { 536 //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp)))); 537 538 // brute force approach 539 temp.append(cp); 540 541 /* TODO: optimize 542 // since we know that the classes are monotonically increasing, after zero 543 // e.g. 0 5 7 9 0 3 544 // we can do an optimization 545 // there are only a few cases that work: zero, less, same, greater 546 // if both classes are the same, we fail 547 // if the decomp class < the segment class, we fail 548 549 segClass = getClass(cp); 550 if (decompClass <= segClass) return null; 551 */ 552 } 553 } 554 if (!ok) 555 return NULL; // we failed, characters left over 556 557 //if (PROGRESS) printf("Matches\n"); 558 559 if (inputLen == temp.length()) { 560 fillinResult->put(UnicodeString(), new UnicodeString(), status); 561 return fillinResult; // succeed, but no remainder 562 } 563 564 // brute force approach 565 // check to make sure result is canonically equivalent 566 UnicodeString trial; 567 nfd.normalize(temp, trial, status); 568 if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) { 569 return NULL; 570 } 571 572 return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status); 573} 574 575U_NAMESPACE_END 576 577#endif /* #if !UCONFIG_NO_NORMALIZATION */ 578