1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ***************************************************************************** 383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Copyright (C) 1996-2011, International Business Machines Corporation and * 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * others. All Rights Reserved. * 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ***************************************************************************** 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_NORMALIZATION 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/caniter.h" 1327f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/normalizer2.h" 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uchar.h" 1527f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/uniset.h" 1627f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/usetiter.h" 1727f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/ustring.h" 1883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius#include "unicode/utf16.h" 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cmemory.h" 2027f654740f2a26ad62a5c155af9199af9e69b889claireho#include "hash.h" 2127f654740f2a26ad62a5c155af9199af9e69b889claireho#include "normalizer2impl.h" 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * This class allows one to iterate through all the strings that are canonically equivalent to a given 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * string. For example, here are some sample results: 26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruResults for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru1: \u0041\u030A\u0064\u0307\u0327 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru2: \u0041\u030A\u0064\u0327\u0307 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru3: \u0041\u030A\u1E0B\u0327 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru4: \u0041\u030A\u1E11\u0307 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru5: \u00C5\u0064\u0307\u0327 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru6: \u00C5\u0064\u0327\u0307 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru7: \u00C5\u1E0B\u0327 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru8: \u00C5\u1E11\u0307 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru9: \u212B\u0064\u0307\u0327 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru10: \u212B\u0064\u0327\u0307 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru11: \u212B\u1E0B\u0327 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru12: \u212B\u1E11\u0307 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * since it has not been optimized for that situation. 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@author M. Davis 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@draft 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// public 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// TODO: add boilerplate methods. 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator) 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@param source string to get results for 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) : 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces(NULL), 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_length(0), 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths(NULL), 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current(NULL), 7350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho current_length(0), 7427f654740f2a26ad62a5c155af9199af9e69b889claireho nfd(*Normalizer2Factory::getNFDInstance(status)), 7527f654740f2a26ad62a5c155af9199af9e69b889claireho nfcImpl(*Normalizer2Factory::getNFCImpl(status)) 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 7727f654740f2a26ad62a5c155af9199af9e69b889claireho if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) { 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setSource(sourceStr, status); 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCanonicalIterator::~CanonicalIterator() { 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanPieces(); 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CanonicalIterator::cleanPieces() { 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i = 0; 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(pieces != NULL) { 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(i = 0; i < pieces_length; i++) { 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(pieces[i] != NULL) { 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] pieces[i]; 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(pieces); 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces = NULL; 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_length = 0; 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(pieces_lengths != NULL) { 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(pieces_lengths); 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths = NULL; 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(current != NULL) { 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(current); 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current = NULL; 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current_length = 0; 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@return gets the source: NOTE: it is the NFD form of source 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeString CanonicalIterator::getSource() { 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return source; 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Resets the iterator so that one can start again from the beginning. 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CanonicalIterator::reset() { 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru done = FALSE; 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i < current_length; ++i) { 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[i] = 0; 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@return the next string that is canonically equivalent. The value null is returned when 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the iteration is done. 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeString CanonicalIterator::next() { 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i = 0; 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (done) { 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru buffer.setToBogus(); 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return buffer; 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // delete old contents 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru buffer.remove(); 140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // construct return value 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i < pieces_length; ++i) { 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru buffer.append(pieces[i][current[i]]); 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //String result = buffer.toString(); // not needed 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // find next value for next time 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = current_length - 1; ; --i) { 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i < 0) { 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru done = TRUE; 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[i]++; 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (current[i] < pieces_lengths[i]) break; // got sequence 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[i] = 0; 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return buffer; 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@param set the source string to iterate against. This allows the same iterator to be used 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * while changing the source string, saving object creation. 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t list_length = 0; 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 cp = 0; 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start = 0; 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i = 0; 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *list = NULL; 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 17327f654740f2a26ad62a5c155af9199af9e69b889claireho nfd.normalize(newSource, source, status); 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru done = FALSE; 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanPieces(); 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // catch degenerate case 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (newSource.length() == 0) { 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *)); 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_length = 1; 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current_length = 1; 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto CleanPartialInitialization; 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[0] = 0; 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces[0] = new UnicodeString[1]; 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths[0] = 1; 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pieces[0] == 0) { 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto CleanPartialInitialization; 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru list = new UnicodeString[source.length()]; 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (list == 0) { 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto CleanPartialInitialization; 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // i should initialy be the number of code units at the 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // start of the string 21183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius i = U16_LENGTH(source.char32At(0)); 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //int32_t i = 1; 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // find the segments 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This code iterates through the source string and 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // extracts segments that end up on a codepoint that 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // doesn't start any decompositions. (Analysis is done 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // on the NFD form - see above). 21883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius for (; i < source.length(); i += U16_LENGTH(cp)) { 219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cp = source.char32At(i); 22027f654740f2a26ad62a5c155af9199af9e69b889claireho if (nfcImpl.isCanonSegmentStarter(cp)) { 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru source.extract(start, i-start, list[list_length++]); // add up to i 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru start = i; 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru source.extract(start, i-start, list[list_length++]); // add last one 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // allocate the arrays, and find the strings that are CE to each segment 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *)); 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_length = list_length; 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current_length = list_length; 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto CleanPartialInitialization; 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i < current_length; i++) { 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[i] = 0; 241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for each segment, get all the combinations that can produce 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // it after NFD normalization 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i < pieces_length; ++i) { 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("SEGMENT\n"); 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces[i] = getEquivalents(list[i], pieces_lengths[i], status); 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] list; 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Common section to cleanup all local variables and reset object variables. 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCleanPartialInitialization: 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (list != NULL) { 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] list; 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanPieces(); 257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Dumb recursive implementation of permutation. 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * TODO: optimize 262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param source the string to find permutations for 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return the results in a set. 264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) { 266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source))); 270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i = 0; 271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // optimization: 273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if zero or one character, just return a set with it 274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we check for length < 2 to keep from counting code points all the time 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (source.length() <= 2 && source.countChar32() <= 1) { 276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *toPut = new UnicodeString(source); 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* test for NULL */ 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (toPut == 0) { 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->put(source, toPut, status); 283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // otherwise iterate through the string, and recursively permute all the other characters 287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 cp; 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable subpermute(status); 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 29283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius subpermute.setValueDeleter(uprv_deleteUObject); 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 29483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius for (i = 0; i < source.length(); i += U16_LENGTH(cp)) { 295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cp = source.char32At(i); 296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *ne = NULL; 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t el = -1; 298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString subPermuteString = source; 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // optimization: 301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if the character is canonical combining class zero, 302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // don't permute it 303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) { 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru subpermute.removeAll(); 309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // see what the permutations of the characters before and after this one are 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp))); 31283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius permute(subPermuteString.replace(i, U16_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status); 313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* Test for buffer overflows */ 314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents 318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of source at this point. 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // prefix this character to all of them 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = subpermute.nextElement(el); 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ne != NULL) { 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *permRes = (UnicodeString *)(ne->value.pointer); 324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *chStr = new UnicodeString(cp); 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //test for NULL 326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (chStr == NULL) { 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer)); 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr)); 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->put(*chStr, chStr, status); 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = subpermute.nextElement(el); 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //return result; 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// privates 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) { 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable result(status); 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable permutations(status); 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable basic(status); 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 34983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius result.setValueDeleter(uprv_deleteUObject); 35083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius permutations.setValueDeleter(uprv_deleteUObject); 35183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius basic.setValueDeleter(uprv_deleteUObject); 352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar USeg[256]; 354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t segLen = segment.extract(USeg, 256, status); 355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru getEquivalents2(&basic, USeg, segLen, status); 356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // now get all the permutations 358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // add only the ones that are canonically equivalent 359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: optimize by not permuting any class zero. 360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *ne = NULL; 362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t el = -1; 363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Iterator it = basic.iterator(); 364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = basic.nextElement(el); 365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //while (it.hasNext()) 366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ne != NULL) { 367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //String item = (String) it.next(); 368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru permutations.removeAll(); 371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru permute(item, CANITER_SKIP_ZEROES, &permutations, status); 372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *ne2 = NULL; 373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t el2 = -1; 374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Iterator it2 = permutations.iterator(); 375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne2 = permutations.nextElement(el2); 376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //while (it2.hasNext()) 377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ne2 != NULL) { 378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //String possible = (String) it2.next(); 379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer))); 380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString possible(*((UnicodeString *)(ne2->value.pointer))); 381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString attempt; 38227f654740f2a26ad62a5c155af9199af9e69b889claireho nfd.normalize(possible, attempt, status); 383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: check if operator == is semanticaly the same as attempt.equals(segment) 385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (attempt==segment) { 386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible))); 387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: use the hashtable just to catch duplicates - store strings directly (somehow). 388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result.put(possible, new UnicodeString(possible), status); //add(possible); 389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible))); 391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne2 = permutations.nextElement(el2); 394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = basic.nextElement(el); 396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* Test for buffer overflows */ 399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // convert into a String[] to clean up storage 403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //String[] finalResult = new String[result.size()]; 404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *finalResult = NULL; 405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t resultCount; 406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if((resultCount = result.count())) { 407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru finalResult = new UnicodeString[resultCount]; 408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (finalResult == 0) { 409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_ILLEGAL_ARGUMENT_ERROR; 415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //result.toArray(finalResult); 418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result_len = 0; 419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el = -1; 420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = result.nextElement(el); 421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while(ne != NULL) { 422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer)); 423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = result.nextElement(el); 424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return finalResult; 428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruHashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) { 431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment))); 437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString toPut(segment, segLen); 439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fillinResult->put(toPut, new UnicodeString(toPut), status); 441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 44227f654740f2a26ad62a5c155af9199af9e69b889claireho UnicodeSet starts; 443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // cycle through all the characters 44527f654740f2a26ad62a5c155af9199af9e69b889claireho UChar32 cp; 44683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) { 447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // see if any character is at the start of some decomposition 44883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_GET(segment, 0, i, segLen, cp); 44927f654740f2a26ad62a5c155af9199af9e69b889claireho if (!nfcImpl.getCanonStartSet(cp, starts)) { 450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 45227f654740f2a26ad62a5c155af9199af9e69b889claireho // if so, see which decompositions match 45327f654740f2a26ad62a5c155af9199af9e69b889claireho UnicodeSetIterator iter(starts); 45427f654740f2a26ad62a5c155af9199af9e69b889claireho while (iter.next()) { 45527f654740f2a26ad62a5c155af9199af9e69b889claireho UChar32 cp2 = iter.getCodepoint(); 456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable remainder(status); 45783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius remainder.setValueDeleter(uprv_deleteUObject); 45827f654740f2a26ad62a5c155af9199af9e69b889claireho if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) { 459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // there were some matches, so add all the possibilities to the set. 463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString prefix(segment, i); 46427f654740f2a26ad62a5c155af9199af9e69b889claireho prefix += cp2; 465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t el = -1; 467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *ne = remainder.nextElement(el); 468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ne != NULL) { 469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *toAdd = new UnicodeString(prefix); 471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* test for NULL */ 472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (toAdd == 0) { 473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *toAdd += item; 477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fillinResult->put(*toAdd, toAdd, status); 478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd))); 480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = remainder.nextElement(el); 482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* Test for buffer overflows */ 487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fillinResult; 491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * See if the decomposition of cp2 is at segment starting at segmentPos 495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * (with canonical rearrangment!) 496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * If so, take the remainder, and return the equivalents 497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruHashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp)))); 501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos); 502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 50750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString temp(comp); 50850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t inputLen=temp.length(); 50950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString decompString; 51050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho nfd.normalize(temp, decompString, status); 51150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *decomp=decompString.getBuffer(); 51250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t decompLen=decompString.length(); 513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See if it matches the start of segment (at segmentPos) 515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool ok = FALSE; 516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 cp; 517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t decompPos = 0; 518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 decompCp; 51950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT(decomp, decompPos, decompLen, decompCp); 520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 52150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t i = segmentPos; 522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while(i < segLen) { 52350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT(segment, i, segLen, cp); 524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (cp == decompCp) { // if equal, eat another cp from decomp 526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp)))); 528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (decompPos == decompLen) { // done, have all decomp characters! 53050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho temp.append(segment+i, segLen-i); 531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ok = TRUE; 532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 53450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT(decomp, decompPos, decompLen, decompCp); 535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp)))); 537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // brute force approach 53950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho temp.append(cp); 540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* TODO: optimize 542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // since we know that the classes are monotonically increasing, after zero 543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // e.g. 0 5 7 9 0 3 544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we can do an optimization 545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // there are only a few cases that work: zero, less, same, greater 546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if both classes are the same, we fail 547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if the decomp class < the segment class, we fail 548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru segClass = getClass(cp); 550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (decompClass <= segClass) return null; 551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!ok) 555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; // we failed, characters left over 556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Matches\n"); 558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 55950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (inputLen == temp.length()) { 560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fillinResult->put(UnicodeString(), new UnicodeString(), status); 561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fillinResult; // succeed, but no remainder 562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // brute force approach 565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // check to make sure result is canonically equivalent 56650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString trial; 56750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho nfd.normalize(temp, trial, status); 56850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) { 569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 57250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status); 573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END 576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_NORMALIZATION */ 578