1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ***************************************************************************** 350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Copyright (C) 1996-2010, International Business Machines Corporation and * 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * others. All Rights Reserved. * 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ***************************************************************************** 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_NORMALIZATION 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/caniter.h" 1327f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/normalizer2.h" 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uchar.h" 1527f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/uniset.h" 1627f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/usetiter.h" 1727f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/ustring.h" 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cmemory.h" 1927f654740f2a26ad62a5c155af9199af9e69b889claireho#include "hash.h" 2027f654740f2a26ad62a5c155af9199af9e69b889claireho#include "normalizer2impl.h" 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * This class allows one to iterate through all the strings that are canonically equivalent to a given 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * string. For example, here are some sample results: 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruResults for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru1: \u0041\u030A\u0064\u0307\u0327 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru2: \u0041\u030A\u0064\u0327\u0307 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru3: \u0041\u030A\u1E0B\u0327 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru4: \u0041\u030A\u1E11\u0307 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru5: \u00C5\u0064\u0307\u0327 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru6: \u00C5\u0064\u0327\u0307 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru7: \u00C5\u1E0B\u0327 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru8: \u00C5\u1E11\u0307 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru9: \u212B\u0064\u0307\u0327 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru10: \u212B\u0064\u0327\u0307 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru11: \u212B\u1E0B\u0327 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru12: \u212B\u1E11\u0307 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * since it has not been optimized for that situation. 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@author M. Davis 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@draft 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// public 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// TODO: add boilerplate methods. 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator) 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@param source string to get results for 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) : 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces(NULL), 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_length(0), 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths(NULL), 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current(NULL), 7250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho current_length(0), 7327f654740f2a26ad62a5c155af9199af9e69b889claireho nfd(*Normalizer2Factory::getNFDInstance(status)), 7427f654740f2a26ad62a5c155af9199af9e69b889claireho nfcImpl(*Normalizer2Factory::getNFCImpl(status)) 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 7627f654740f2a26ad62a5c155af9199af9e69b889claireho if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) { 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setSource(sourceStr, status); 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCanonicalIterator::~CanonicalIterator() { 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanPieces(); 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CanonicalIterator::cleanPieces() { 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i = 0; 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(pieces != NULL) { 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(i = 0; i < pieces_length; i++) { 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(pieces[i] != NULL) { 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] pieces[i]; 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(pieces); 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces = NULL; 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_length = 0; 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(pieces_lengths != NULL) { 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(pieces_lengths); 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths = NULL; 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(current != NULL) { 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(current); 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current = NULL; 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current_length = 0; 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@return gets the source: NOTE: it is the NFD form of source 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeString CanonicalIterator::getSource() { 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return source; 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Resets the iterator so that one can start again from the beginning. 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CanonicalIterator::reset() { 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru done = FALSE; 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i < current_length; ++i) { 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[i] = 0; 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@return the next string that is canonically equivalent. The value null is returned when 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the iteration is done. 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeString CanonicalIterator::next() { 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i = 0; 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (done) { 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru buffer.setToBogus(); 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return buffer; 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // delete old contents 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru buffer.remove(); 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // construct return value 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i < pieces_length; ++i) { 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru buffer.append(pieces[i][current[i]]); 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //String result = buffer.toString(); // not needed 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // find next value for next time 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = current_length - 1; ; --i) { 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i < 0) { 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru done = TRUE; 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[i]++; 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (current[i] < pieces_lengths[i]) break; // got sequence 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[i] = 0; 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return buffer; 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *@param set the source string to iterate against. This allows the same iterator to be used 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * while changing the source string, saving object creation. 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t list_length = 0; 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 cp = 0; 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start = 0; 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i = 0; 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *list = NULL; 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 17227f654740f2a26ad62a5c155af9199af9e69b889claireho nfd.normalize(newSource, source, status); 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru done = FALSE; 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanPieces(); 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // catch degenerate case 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (newSource.length() == 0) { 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *)); 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_length = 1; 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current_length = 1; 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto CleanPartialInitialization; 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[0] = 0; 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces[0] = new UnicodeString[1]; 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths[0] = 1; 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pieces[0] == 0) { 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto CleanPartialInitialization; 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru list = new UnicodeString[source.length()]; 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (list == 0) { 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto CleanPartialInitialization; 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // i should initialy be the number of code units at the 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // start of the string 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru i = UTF16_CHAR_LENGTH(source.char32At(0)); 211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //int32_t i = 1; 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // find the segments 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This code iterates through the source string and 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // extracts segments that end up on a codepoint that 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // doesn't start any decompositions. (Analysis is done 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // on the NFD form - see above). 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) { 218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cp = source.char32At(i); 21927f654740f2a26ad62a5c155af9199af9e69b889claireho if (nfcImpl.isCanonSegmentStarter(cp)) { 220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru source.extract(start, i-start, list[list_length++]); // add up to i 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru start = i; 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru source.extract(start, i-start, list[list_length++]); // add last one 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // allocate the arrays, and find the strings that are CE to each segment 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *)); 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_length = list_length; 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current_length = list_length; 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto CleanPartialInitialization; 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i < current_length; i++) { 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru current[i] = 0; 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for each segment, get all the combinations that can produce 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // it after NFD normalization 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i < pieces_length; ++i) { 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("SEGMENT\n"); 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pieces[i] = getEquivalents(list[i], pieces_lengths[i], status); 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] list; 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Common section to cleanup all local variables and reset object variables. 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCleanPartialInitialization: 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (list != NULL) { 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] list; 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanPieces(); 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Dumb recursive implementation of permutation. 260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * TODO: optimize 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param source the string to find permutations for 262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return the results in a set. 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) { 265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source))); 269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i = 0; 270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // optimization: 272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if zero or one character, just return a set with it 273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we check for length < 2 to keep from counting code points all the time 274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (source.length() <= 2 && source.countChar32() <= 1) { 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *toPut = new UnicodeString(source); 276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* test for NULL */ 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (toPut == 0) { 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->put(source, toPut, status); 282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // otherwise iterate through the string, and recursively permute all the other characters 286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 cp; 287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable subpermute(status); 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru subpermute.setValueDeleter(uhash_deleteUnicodeString); 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) { 294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cp = source.char32At(i); 295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *ne = NULL; 296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t el = -1; 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString subPermuteString = source; 298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // optimization: 300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if the character is canonical combining class zero, 301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // don't permute it 302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) { 303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru subpermute.removeAll(); 308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // see what the permutations of the characters before and after this one are 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp))); 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru permute(subPermuteString.replace(i, UTF16_CHAR_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status); 312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* Test for buffer overflows */ 313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents 317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of source at this point. 318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // prefix this character to all of them 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = subpermute.nextElement(el); 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ne != NULL) { 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *permRes = (UnicodeString *)(ne->value.pointer); 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *chStr = new UnicodeString(cp); 324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //test for NULL 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (chStr == NULL) { 326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer)); 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr)); 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->put(*chStr, chStr, status); 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = subpermute.nextElement(el); 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //return result; 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// privates 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) { 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable result(status); 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable permutations(status); 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable basic(status); 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result.setValueDeleter(uhash_deleteUnicodeString); 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru permutations.setValueDeleter(uhash_deleteUnicodeString); 350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru basic.setValueDeleter(uhash_deleteUnicodeString); 351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar USeg[256]; 353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t segLen = segment.extract(USeg, 256, status); 354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru getEquivalents2(&basic, USeg, segLen, status); 355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // now get all the permutations 357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // add only the ones that are canonically equivalent 358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: optimize by not permuting any class zero. 359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *ne = NULL; 361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t el = -1; 362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Iterator it = basic.iterator(); 363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = basic.nextElement(el); 364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //while (it.hasNext()) 365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ne != NULL) { 366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //String item = (String) it.next(); 367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru permutations.removeAll(); 370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru permute(item, CANITER_SKIP_ZEROES, &permutations, status); 371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *ne2 = NULL; 372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t el2 = -1; 373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Iterator it2 = permutations.iterator(); 374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne2 = permutations.nextElement(el2); 375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //while (it2.hasNext()) 376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ne2 != NULL) { 377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //String possible = (String) it2.next(); 378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer))); 379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString possible(*((UnicodeString *)(ne2->value.pointer))); 380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString attempt; 38127f654740f2a26ad62a5c155af9199af9e69b889claireho nfd.normalize(possible, attempt, status); 382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: check if operator == is semanticaly the same as attempt.equals(segment) 384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (attempt==segment) { 385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible))); 386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: use the hashtable just to catch duplicates - store strings directly (somehow). 387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result.put(possible, new UnicodeString(possible), status); //add(possible); 388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible))); 390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne2 = permutations.nextElement(el2); 393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = basic.nextElement(el); 395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* Test for buffer overflows */ 398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // convert into a String[] to clean up storage 402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //String[] finalResult = new String[result.size()]; 403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *finalResult = NULL; 404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t resultCount; 405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if((resultCount = result.count())) { 406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru finalResult = new UnicodeString[resultCount]; 407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (finalResult == 0) { 408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_ILLEGAL_ARGUMENT_ERROR; 414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //result.toArray(finalResult); 417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result_len = 0; 418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru el = -1; 419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = result.nextElement(el); 420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while(ne != NULL) { 421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer)); 422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = result.nextElement(el); 423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return finalResult; 427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruHashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) { 430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment))); 436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString toPut(segment, segLen); 438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fillinResult->put(toPut, new UnicodeString(toPut), status); 440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 44127f654740f2a26ad62a5c155af9199af9e69b889claireho UnicodeSet starts; 442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // cycle through all the characters 44427f654740f2a26ad62a5c155af9199af9e69b889claireho UChar32 cp; 44527f654740f2a26ad62a5c155af9199af9e69b889claireho for (int32_t i = 0; i < segLen; i += UTF16_CHAR_LENGTH(cp)) { 446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // see if any character is at the start of some decomposition 447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_GET_CHAR(segment, 0, i, segLen, cp); 44827f654740f2a26ad62a5c155af9199af9e69b889claireho if (!nfcImpl.getCanonStartSet(cp, starts)) { 449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 45127f654740f2a26ad62a5c155af9199af9e69b889claireho // if so, see which decompositions match 45227f654740f2a26ad62a5c155af9199af9e69b889claireho UnicodeSetIterator iter(starts); 45327f654740f2a26ad62a5c155af9199af9e69b889claireho while (iter.next()) { 45427f654740f2a26ad62a5c155af9199af9e69b889claireho UChar32 cp2 = iter.getCodepoint(); 455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Hashtable remainder(status); 456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru remainder.setValueDeleter(uhash_deleteUnicodeString); 45727f654740f2a26ad62a5c155af9199af9e69b889claireho if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) { 458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // there were some matches, so add all the possibilities to the set. 462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString prefix(segment, i); 46327f654740f2a26ad62a5c155af9199af9e69b889claireho prefix += cp2; 464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t el = -1; 466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UHashElement *ne = remainder.nextElement(el); 467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ne != NULL) { 468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *toAdd = new UnicodeString(prefix); 470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* test for NULL */ 471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (toAdd == 0) { 472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *toAdd += item; 476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fillinResult->put(*toAdd, toAdd, status); 477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd))); 479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ne = remainder.nextElement(el); 481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* Test for buffer overflows */ 486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fillinResult; 490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * See if the decomposition of cp2 is at segment starting at segmentPos 494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * (with canonical rearrangment!) 495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * If so, take the remainder, and return the equivalents 496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruHashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp)))); 500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos); 501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 50650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString temp(comp); 50750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t inputLen=temp.length(); 50850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString decompString; 50950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho nfd.normalize(temp, decompString, status); 51050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *decomp=decompString.getBuffer(); 51150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t decompLen=decompString.length(); 512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See if it matches the start of segment (at segmentPos) 514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool ok = FALSE; 515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 cp; 516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t decompPos = 0; 517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 decompCp; 51850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT(decomp, decompPos, decompLen, decompCp); 519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 52050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t i = segmentPos; 521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while(i < segLen) { 52250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT(segment, i, segLen, cp); 523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (cp == decompCp) { // if equal, eat another cp from decomp 525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp)))); 527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (decompPos == decompLen) { // done, have all decomp characters! 52950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho temp.append(segment+i, segLen-i); 530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ok = TRUE; 531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 53350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT(decomp, decompPos, decompLen, decompCp); 534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp)))); 536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // brute force approach 53850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho temp.append(cp); 539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* TODO: optimize 541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // since we know that the classes are monotonically increasing, after zero 542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // e.g. 0 5 7 9 0 3 543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we can do an optimization 544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // there are only a few cases that work: zero, less, same, greater 545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if both classes are the same, we fail 546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if the decomp class < the segment class, we fail 547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru segClass = getClass(cp); 549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (decompClass <= segClass) return null; 550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!ok) 554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; // we failed, characters left over 555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //if (PROGRESS) printf("Matches\n"); 557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 55850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (inputLen == temp.length()) { 559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fillinResult->put(UnicodeString(), new UnicodeString(), status); 560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fillinResult; // succeed, but no remainder 561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // brute force approach 564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // check to make sure result is canonically equivalent 56550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString trial; 56650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho nfd.normalize(temp, trial, status); 56750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) { 568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 57150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status); 572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END 575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_NORMALIZATION */ 577