16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ***************************************************************************** 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copyright (C) 1996-2011, International Business Machines Corporation and * 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * others. All Rights Reserved. * 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ***************************************************************************** 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_NORMALIZATION 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/caniter.h" 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/normalizer2.h" 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uchar.h" 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h" 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/usetiter.h" 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ustring.h" 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf16.h" 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cmemory.h" 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "hash.h" 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "normalizer2impl.h" 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This class allows one to iterate through all the strings that are canonically equivalent to a given 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * string. For example, here are some sample results: 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgResults for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org1: \u0041\u030A\u0064\u0307\u0327 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org2: \u0041\u030A\u0064\u0327\u0307 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org3: \u0041\u030A\u1E0B\u0327 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org4: \u0041\u030A\u1E11\u0307 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org5: \u00C5\u0064\u0307\u0327 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org6: \u00C5\u0064\u0327\u0307 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org7: \u00C5\u1E0B\u0327 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org8: \u00C5\u1E11\u0307 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org9: \u212B\u0064\u0307\u0327 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org10: \u212B\u0064\u0327\u0307 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org11: \u212B\u1E0B\u0327 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org12: \u212B\u1E11\u0307 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * since it has not been optimized for that situation. 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *@author M. Davis 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *@draft 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// public 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// TODO: add boilerplate methods. 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator) 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *@param source string to get results for 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) : 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces(NULL), 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces_length(0), 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces_lengths(NULL), 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current(NULL), 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current_length(0), 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org nfd(*Normalizer2Factory::getNFDInstance(status)), 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org nfcImpl(*Normalizer2Factory::getNFCImpl(status)) 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) { 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org setSource(sourceStr, status); 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCanonicalIterator::~CanonicalIterator() { 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cleanPieces(); 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CanonicalIterator::cleanPieces() { 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i = 0; 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pieces != NULL) { 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i = 0; i < pieces_length; i++) { 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pieces[i] != NULL) { 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete[] pieces[i]; 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_free(pieces); 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces = NULL; 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces_length = 0; 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pieces_lengths != NULL) { 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_free(pieces_lengths); 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces_lengths = NULL; 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(current != NULL) { 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_free(current); 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current = NULL; 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current_length = 0; 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *@return gets the source: NOTE: it is the NFD form of source 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUnicodeString CanonicalIterator::getSource() { 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return source; 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Resets the iterator so that one can start again from the beginning. 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CanonicalIterator::reset() { 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org done = FALSE; 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int i = 0; i < current_length; ++i) { 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current[i] = 0; 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *@return the next string that is canonically equivalent. The value null is returned when 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the iteration is done. 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUnicodeString CanonicalIterator::next() { 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i = 0; 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (done) { 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org buffer.setToBogus(); 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return buffer; 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // delete old contents 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org buffer.remove(); 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // construct return value 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (i = 0; i < pieces_length; ++i) { 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org buffer.append(pieces[i][current[i]]); 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //String result = buffer.toString(); // not needed 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // find next value for next time 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (i = current_length - 1; ; --i) { 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (i < 0) { 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org done = TRUE; 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current[i]++; 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (current[i] < pieces_lengths[i]) break; // got sequence 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current[i] = 0; 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return buffer; 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *@param set the source string to iterate against. This allows the same iterator to be used 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * while changing the source string, saving object creation. 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t list_length = 0; 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 cp = 0; 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t start = 0; 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i = 0; 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *list = NULL; 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org nfd.normalize(newSource, source, status); 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_FAILURE(status)) { 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org done = FALSE; 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cleanPieces(); 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // catch degenerate case 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (newSource.length() == 0) { 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *)); 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces_length = 1; 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current_length = 1; 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto CleanPartialInitialization; 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current[0] = 0; 1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces[0] = new UnicodeString[1]; 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces_lengths[0] = 1; 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (pieces[0] == 0) { 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto CleanPartialInitialization; 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list = new UnicodeString[source.length()]; 2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (list == 0) { 2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto CleanPartialInitialization; 2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // i should initialy be the number of code units at the 2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // start of the string 2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org i = U16_LENGTH(source.char32At(0)); 2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //int32_t i = 1; 2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // find the segments 2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // This code iterates through the source string and 2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // extracts segments that end up on a codepoint that 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // doesn't start any decompositions. (Analysis is done 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // on the NFD form - see above). 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (; i < source.length(); i += U16_LENGTH(cp)) { 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cp = source.char32At(i); 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (nfcImpl.isCanonSegmentStarter(cp)) { 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org source.extract(start, i-start, list[list_length++]); // add up to i 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start = i; 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org source.extract(start, i-start, list[list_length++]); // add last one 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // allocate the arrays, and find the strings that are CE to each segment 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *)); 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces_length = list_length; 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current_length = list_length; 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto CleanPartialInitialization; 2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (i = 0; i < current_length; i++) { 2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current[i] = 0; 2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // for each segment, get all the combinations that can produce 2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // it after NFD normalization 2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (i = 0; i < pieces_length; ++i) { 2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf("SEGMENT\n"); 2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pieces[i] = getEquivalents(list[i], pieces_lengths[i], status); 2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete[] list; 2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Common section to cleanup all local variables and reset object variables. 2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCleanPartialInitialization: 2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (list != NULL) { 2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete[] list; 2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cleanPieces(); 2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Dumb recursive implementation of permutation. 2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * TODO: optimize 2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param source the string to find permutations for 2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return the results in a set. 2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) { 2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_FAILURE(status)) { 2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source))); 2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i = 0; 2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // optimization: 2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if zero or one character, just return a set with it 2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // we check for length < 2 to keep from counting code points all the time 2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (source.length() <= 2 && source.countChar32() <= 1) { 2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *toPut = new UnicodeString(source); 2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* test for NULL */ 2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (toPut == 0) { 2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result->put(source, toPut, status); 2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // otherwise iterate through the string, and recursively permute all the other characters 2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 cp; 2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Hashtable subpermute(status); 2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_FAILURE(status)) { 2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org subpermute.setValueDeleter(uprv_deleteUObject); 2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (i = 0; i < source.length(); i += U16_LENGTH(cp)) { 2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cp = source.char32At(i); 2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UHashElement *ne = NULL; 2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t el = -1; 2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString subPermuteString = source; 2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // optimization: 3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if the character is canonical combining class zero, 3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // don't permute it 3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) { 3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org subpermute.removeAll(); 3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // see what the permutations of the characters before and after this one are 3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp))); 3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org permute(subPermuteString.replace(i, U16_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status); 3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* Test for buffer overflows */ 3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_FAILURE(status)) { 3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents 3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // of source at this point. 3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // prefix this character to all of them 3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ne = subpermute.nextElement(el); 3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (ne != NULL) { 3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *permRes = (UnicodeString *)(ne->value.pointer); 3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *chStr = new UnicodeString(cp); 3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //test for NULL 3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (chStr == NULL) { 3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer)); 3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr)); 3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result->put(*chStr, chStr, status); 3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ne = subpermute.nextElement(el); 3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //return result; 3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// privates 3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) { 3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Hashtable result(status); 3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Hashtable permutations(status); 3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Hashtable basic(status); 3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result.setValueDeleter(uprv_deleteUObject); 3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org permutations.setValueDeleter(uprv_deleteUObject); 3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org basic.setValueDeleter(uprv_deleteUObject); 3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar USeg[256]; 3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t segLen = segment.extract(USeg, 256, status); 3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org getEquivalents2(&basic, USeg, segLen, status); 3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // now get all the permutations 3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // add only the ones that are canonically equivalent 3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: optimize by not permuting any class zero. 3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UHashElement *ne = NULL; 3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t el = -1; 3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //Iterator it = basic.iterator(); 3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ne = basic.nextElement(el); 3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //while (it.hasNext()) 3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (ne != NULL) { 3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //String item = (String) it.next(); 3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org permutations.removeAll(); 3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org permute(item, CANITER_SKIP_ZEROES, &permutations, status); 3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UHashElement *ne2 = NULL; 3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t el2 = -1; 3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //Iterator it2 = permutations.iterator(); 3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ne2 = permutations.nextElement(el2); 3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //while (it2.hasNext()) 3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (ne2 != NULL) { 3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //String possible = (String) it2.next(); 3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer))); 3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString possible(*((UnicodeString *)(ne2->value.pointer))); 3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString attempt; 3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org nfd.normalize(possible, attempt, status); 3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: check if operator == is semanticaly the same as attempt.equals(segment) 3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (attempt==segment) { 3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible))); 3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: use the hashtable just to catch duplicates - store strings directly (somehow). 3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result.put(possible, new UnicodeString(possible), status); //add(possible); 3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible))); 3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ne2 = permutations.nextElement(el2); 3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ne = basic.nextElement(el); 3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* Test for buffer overflows */ 3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_FAILURE(status)) { 4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // convert into a String[] to clean up storage 4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //String[] finalResult = new String[result.size()]; 4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *finalResult = NULL; 4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t resultCount; 4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((resultCount = result.count())) { 4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org finalResult = new UnicodeString[resultCount]; 4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (finalResult == 0) { 4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else { 4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_ILLEGAL_ARGUMENT_ERROR; 4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //result.toArray(finalResult); 4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result_len = 0; 4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org el = -1; 4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ne = result.nextElement(el); 4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(ne != NULL) { 4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer)); 4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ne = result.nextElement(el); 4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return finalResult; 4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgHashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) { 4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment))); 4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString toPut(segment, segLen); 4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fillinResult->put(toPut, new UnicodeString(toPut), status); 4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeSet starts; 4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // cycle through all the characters 4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 cp; 4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) { 4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // see if any character is at the start of some decomposition 4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U16_GET(segment, 0, i, segLen, cp); 4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (!nfcImpl.getCanonStartSet(cp, starts)) { 4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if so, see which decompositions match 4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeSetIterator iter(starts); 4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (iter.next()) { 4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 cp2 = iter.getCodepoint(); 4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Hashtable remainder(status); 4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org remainder.setValueDeleter(uprv_deleteUObject); 4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) { 4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // there were some matches, so add all the possibilities to the set. 4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString prefix(segment, i); 4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org prefix += cp2; 4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t el = -1; 4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UHashElement *ne = remainder.nextElement(el); 4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (ne != NULL) { 4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *toAdd = new UnicodeString(prefix); 4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* test for NULL */ 4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (toAdd == 0) { 4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *toAdd += item; 4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fillinResult->put(*toAdd, toAdd, status); 4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd))); 4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ne = remainder.nextElement(el); 4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* Test for buffer overflows */ 4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_FAILURE(status)) { 4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return fillinResult; 4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * See if the decomposition of cp2 is at segment starting at segmentPos 4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (with canonical rearrangment!) 4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * If so, take the remainder, and return the equivalents 4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgHashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp)))); 5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos); 5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString temp(comp); 5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t inputLen=temp.length(); 5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString decompString; 5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org nfd.normalize(temp, decompString, status); 5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *decomp=decompString.getBuffer(); 5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t decompLen=decompString.length(); 5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // See if it matches the start of segment (at segmentPos) 5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool ok = FALSE; 5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 cp; 5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t decompPos = 0; 5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 decompCp; 5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U16_NEXT(decomp, decompPos, decompLen, decompCp); 5206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i = segmentPos; 5226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(i < segLen) { 5236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U16_NEXT(segment, i, segLen, cp); 5246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (cp == decompCp) { // if equal, eat another cp from decomp 5266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp)))); 5286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (decompPos == decompLen) { // done, have all decomp characters! 5306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org temp.append(segment+i, segLen-i); 5316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ok = TRUE; 5326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 5336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U16_NEXT(decomp, decompPos, decompLen, decompCp); 5356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 5366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp)))); 5376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // brute force approach 5396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org temp.append(cp); 5406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* TODO: optimize 5426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // since we know that the classes are monotonically increasing, after zero 5436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // e.g. 0 5 7 9 0 3 5446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // we can do an optimization 5456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // there are only a few cases that work: zero, less, same, greater 5466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if both classes are the same, we fail 5476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if the decomp class < the segment class, we fail 5486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org segClass = getClass(cp); 5506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (decompClass <= segClass) return null; 5516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 5526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (!ok) 5556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; // we failed, characters left over 5566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //if (PROGRESS) printf("Matches\n"); 5586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (inputLen == temp.length()) { 5606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fillinResult->put(UnicodeString(), new UnicodeString(), status); 5616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return fillinResult; // succeed, but no remainder 5626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // brute force approach 5656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // check to make sure result is canonically equivalent 5666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString trial; 5676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org nfd.normalize(temp, trial, status); 5686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) { 5696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 5706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status); 5736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END 5766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif /* #if !UCONFIG_NO_NORMALIZATION */ 578