1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* Copyright (C) 2001-2011 IBM and others. All rights reserved. 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Date Name Description 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 07/02/2001 synwee Creation. 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/usearch.h" 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ustring.h" 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uchar.h" 1750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "normalizer2impl.h" 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "ucol_imp.h" 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "usrchimp.h" 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cmemory.h" 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "ucln_in.h" 22c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uassert.h" 2350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "ustr_imp.h" 24c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 25c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_USE 26c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 27c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// don't use Boyer-Moore 2850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// (and if we decide to turn this on again there are several new TODOs that will need to be addressed) 29c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#define BOYER_MOORE 0 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// internal definition --------------------------------------------------- 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define LAST_BYTE_MASK_ 0xFF 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define SECOND_LAST_BYTE_SHIFT_ 8 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define SUPPLEMENTARY_MIN_VALUE_ 0x10000 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 39b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querustatic const uint16_t *fcdTrieIndex = NULL; 40b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querustatic UChar32 fcdHighStart = 0; 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// internal methods ------------------------------------------------- 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Fast collation element iterator setOffset. 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This function does not check for bounds. 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param offset to set 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 50b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querustatic 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline void setColEIterOffset(UCollationElements *elems, 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset) 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru collIterate *ci = &(elems->iteratordata_); 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ci->pos = ci->string + offset; 56c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ci->CEpos = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs; 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ci->flags & UCOL_ITER_INNORMBUF) { 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ci->flags = ci->origFlags; 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ci->fcdPosition = NULL; 61c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 62b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ci->offsetReturn = NULL; 63c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ci->offsetStore = ci->offsetBuffer; 64b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ci->offsetRepeatCount = ci->offsetRepeatValue = 0; 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Getting the mask for collation strength 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strength collation strength 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return collation element mask 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 73b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline uint32_t getMask(UCollationStrength strength) 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 75b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru switch (strength) 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case UCOL_PRIMARY: 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return UCOL_PRIMARYORDERMASK; 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case UCOL_SECONDARY: 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 82b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCOL_PRIMARYORDERMASK; 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This is to squeeze the 21bit ces into a 256 table 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param ce collation element 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return collapsed version of the collation element 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 93b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int hash(uint32_t ce) 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // well with the new collation where most of the latin 1 characters 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are of the value xx000xxx. their hashes will most of the time be 0 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to be discussed on the hash algo. 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CDECL_BEGIN 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic UBool U_CALLCONV 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruusearch_cleanup(void) { 105b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru fcdTrieIndex = NULL; 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CDECL_END 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Initializing the fcd tables. 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be a success. 113b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 117b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void initializeFCD(UErrorCode *status) 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 119b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (fcdTrieIndex == NULL) { 120b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the fcd value for a character at the argument index. 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This method takes into accounts of the supplementary characters. 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param str UTF16 string where character for fcd retrieval resides 129b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param offset position of the character whose fcd is to be retrieved, to be 130b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* overwritten with the next character position, taking 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* surrogate characters into consideration. 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strlength length of the argument string 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return fcd value 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 136b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruuint16_t getFCD(const UChar *str, int32_t *offset, 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t strlength) 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 139b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *temp = str + *offset; 140b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint16_t result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength); 141b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *offset = (int32_t)(temp - str); 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 146b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Getting the modified collation elements taking into account the collation 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* attributes 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 149b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param sourcece 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return the modified collation element 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // note for tertiary we can't use the collator->tertiaryMask, that 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is a preprocessed mask that takes into account case options. since 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we are only concerned with exact matches, we don't need that. 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sourcece &= strsrch->ceMask; 159b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->toShift) { 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // alternate handling here, since only the 16 most significant digits 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is only used, we can safely do a compare without masking 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if the ce is a variable, we mask and get only the primary values 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // no shifting to quartenary is required since all primary values 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // less than variabletop will need to be masked off anyway. 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->variableTop > sourcece) { 167b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strsrch->strength >= UCOL_QUATERNARY) { 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sourcece &= UCOL_PRIMARYORDERMASK; 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 170b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru else { 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sourcece = UCOL_IGNORABLE; 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 174b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { 175b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru sourcece = 0xFFFF; 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return sourcece; 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 181b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru/** 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Allocate a memory and returns NULL if it failed. 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be a success. 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param size to allocate 185b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return newly allocated array, NULL otherwise 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 190b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void * allocateMemory(uint32_t size, UErrorCode *status) 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t *result = (uint32_t *)uprv_malloc(size); 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result == NULL) { 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_MEMORY_ALLOCATION_ERROR; 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Adds a uint32_t value to a destination array. 201b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Creates a new array if we run out of space. The caller will have to 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* manually deallocate the newly allocated array. 203b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 204b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* before calling this method. destination not to be NULL and has at least 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* size destinationlength. 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param destination target array 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param offset destination offset to add value 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param destinationlength target array size, return value for the new size 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param value to be added 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param increments incremental size expected 211b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return new destination array, destination if there was no new allocation 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 216b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t * addTouint32_tArray(int32_t *destination, 217b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t offset, 218b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t *destinationlength, 219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t value, 220b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t increments, 221b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t newlength = *destinationlength; 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset + 1 == newlength) { 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru newlength += increments; 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *temp = (int32_t *)allocateMemory( 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(int32_t) * newlength, status); 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memcpy(temp, destination, sizeof(int32_t) * offset); 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *destinationlength = newlength; 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru destination = temp; 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru destination[offset] = value; 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return destination; 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 240c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Adds a uint64_t value to a destination array. 241b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Creates a new array if we run out of space. The caller will have to 242c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* manually deallocate the newly allocated array. 243b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 244b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* before calling this method. destination not to be NULL and has at least 245c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* size destinationlength. 246c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param destination target array 247c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param offset destination offset to add value 248c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param destinationlength target array size, return value for the new size 249c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param value to be added 250c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param increments incremental size expected 251b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 252c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* method, status assumed to be success when passed in. 253c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @return new destination array, destination if there was no new allocation 254c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru*/ 255c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic 256b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int64_t * addTouint64_tArray(int64_t *destination, 257b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t offset, 258b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t *destinationlength, 259c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint64_t value, 260b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t increments, 261b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 262c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 263c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint32_t newlength = *destinationlength; 264c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (offset + 1 == newlength) { 265c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru newlength += increments; 266c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t *temp = (int64_t *)allocateMemory( 267c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru sizeof(int64_t) * newlength, status); 268b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 269c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*status)) { 270c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 271c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 272c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 273c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_memcpy(temp, destination, sizeof(int64_t) * offset); 274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *destinationlength = newlength; 275c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru destination = temp; 276c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 277c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 278c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru destination[offset] = value; 279c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 280c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return destination; 281c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 282c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 283c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/** 284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Initializing the ce table for a pattern. 285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Stores non-ignorable collation keys. 286b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Table size will be estimated by the size of the pattern text. Table 287b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* expansion will be perform as we go along. Adding 1 to ensure that the table 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* size definitely increases. 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be a success. 290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 291b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 293b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return total number of expansions 294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 296b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline uint16_t initializePatternCETable(UStringSearch *strsrch, 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UPattern *pattern = &(strsrch->pattern); 300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t cetablesize = INITIAL_ARRAY_SIZE_; 301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *cetable = pattern->CEBuffer; 302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t patternlength = pattern->textLength; 303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 304b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (coleiter == NULL) { 306b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru coleiter = ucol_openElements(strsrch->collator, pattern->text, 307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternlength, status); 308b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // status will be checked in ucol_next(..) later and if it is an 309b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // returned. 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->utilIter = coleiter; 312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_init_collIterate(strsrch->collator, pattern->text, 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pattern->textLength, 31650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho &coleiter->iteratordata_, 31750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho status); 31850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 31950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(*status)) { 32050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 322b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pattern->CE != cetable && pattern->CE) { 324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(pattern->CE); 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 326b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint16_t offset = 0; 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint16_t result = 0; 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce; 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SUCCESS(*status)) { 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t newce = getCE(strsrch, ce); 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (newce) { 335b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru newce, 337b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru patternlength - ucol_getOffset(coleiter) + 1, 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset ++; 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (cetable != temp && cetable != pattern->CEBuffer) { 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(cetable); 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cetable = temp; 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cetable[offset] = 0; 352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pattern->CE = cetable; 353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pattern->CELength = offset; 354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 359c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Initializing the pce table for a pattern. 360c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Stores non-ignorable collation keys. 361b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Table size will be estimated by the size of the pattern text. Table 362b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* expansion will be perform as we go along. Adding 1 to ensure that the table 363c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* size definitely increases. 364c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Internal method, status assumed to be a success. 365c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param strsrch string search data 366b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 367c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* method, status assumed to be success when passed in. 368b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return total number of expansions 369c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru*/ 370c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic 371b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline uint16_t initializePatternPCETable(UStringSearch *strsrch, 372c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode *status) 373c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 374c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UPattern *pattern = &(strsrch->pattern); 375c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; 376c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t *pcetable = pattern->PCEBuffer; 377c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint32_t patternlength = pattern->textLength; 378c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 379b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 380c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (coleiter == NULL) { 381b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru coleiter = ucol_openElements(strsrch->collator, pattern->text, 382c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru patternlength, status); 383b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // status will be checked in ucol_next(..) later and if it is an 384b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 385c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // returned. 386c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->utilIter = coleiter; 387c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 388c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_init_collIterate(strsrch->collator, pattern->text, 389c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru pattern->textLength, 39050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho &coleiter->iteratordata_, 39150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho status); 39250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 39350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(*status)) { 39450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 395c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 396b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 397c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (pattern->PCE != pcetable && pattern->PCE != NULL) { 398c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_free(pattern->PCE); 399c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 400b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 401c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint16_t offset = 0; 402c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint16_t result = 0; 403c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t pce; 404c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 405c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_init_pce(coleiter); 406c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 407c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // ** Should processed CEs be signed or unsigned? 408b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // ** (the rest of the code in this file seems to play fast-and-loose with 409c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 410c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && 411c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_SUCCESS(*status)) { 412b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, 413c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru pce, 414b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru patternlength - ucol_getOffset(coleiter) + 1, 415c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru status); 416c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 417c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*status)) { 418c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 419c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 420c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 421c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru offset += 1; 422c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 423c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (pcetable != temp && pcetable != pattern->PCEBuffer) { 424c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_free(pcetable); 425c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 426c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 427c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru pcetable = temp; 428c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 429c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 430c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 431c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru pcetable[offset] = 0; 432c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru pattern->PCE = pcetable; 433c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru pattern->PCELength = offset; 434c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 435c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return result; 436c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 437c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 438c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/** 439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Initializes the pattern struct. 440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success. 441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch UStringSearch data storage 442b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return expansionsize the total expansion size of the pattern 445b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*/ 446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 447b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) 448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UPattern *pattern = &(strsrch->pattern); 450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *patterntext = pattern->text; 451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t length = pattern->textLength; 452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t index = 0; 453b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Since the strength is primary, accents are ignored in the pattern. 455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->strength == UCOL_PRIMARY) { 456b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pattern->hasPrefixAccents = 0; 457b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pattern->hasSuffixAccents = 0; 458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 459b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> 460b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho SECOND_LAST_BYTE_SHIFT_; 461b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho index = length; 462b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UTF_BACK_1(patterntext, 0, index); 463b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & 464b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho LAST_BYTE_MASK_; 465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 466c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 467c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // ** HACK ** 468c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (strsrch->pattern.PCE != NULL) { 469c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { 470c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_free(strsrch->pattern.PCE); 471c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 472c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 473c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->pattern.PCE = NULL; 474c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 475c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // since intializePattern is an internal method status is a success. 477b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return initializePatternCETable(strsrch, status); 478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Initializing shift tables, with the default values. 482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If a corresponding default value is 0, the shift table is not set. 483b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param shift table for forwards shift 484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param backshift table for backwards shift 485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param cetable table containing pattern ce 486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param cesize size of the pattern ces 487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param expansionsize total size of the expansions 488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param defaultforward the default forward value 489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param defaultbackward the default backward value 490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 492b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void setShiftTable(int16_t shift[], int16_t backshift[], 493b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *cetable, int32_t cesize, 494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int16_t expansionsize, 495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int16_t defaultforward, 496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int16_t defaultbackward) 497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 498b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // estimate the value to shift. to do that we estimate the smallest 499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // number of characters to give the relevant ces, ie approximately 500b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // the number of ces minus their expansion, since expansions can come 501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // from a character. 502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count; 503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru shift[count] = defaultforward; 505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cesize --; // down to the last index 507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (count = 0; count < cesize; count ++) { 508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // number of ces from right of array to the count 509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int temp = defaultforward - count - 1; 510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru shift[hash(cetable[count])] = temp > 1 ? temp : 1; 511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru shift[hash(cetable[cesize])] = 1; 513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for ignorables we just shift by one. see test examples. 514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru shift[hash(0)] = 1; 515b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru backshift[count] = defaultbackward; 518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (count = cesize; count > 0; count --) { 520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the original value count does not seem to work 521b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru backshift[hash(cetable[count])] = count > expansionsize ? 522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (int16_t)(count - expansionsize) : 1; 523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru backshift[hash(cetable[0])] = 1; 525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru backshift[hash(0)] = 1; 526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Building of the pattern collation element list and the boyer moore strsrch 530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* table. 531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* The canonical match will only be performed after the default match fails. 532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* For both cases we need to remember the size of the composed and decomposed 533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* versions of the string. Since the Boyer-Moore shift calculations shifts by 534b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* a number of characters in the text and tries to match the pattern from that 535b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* offset, the shift value can not be too large in case we miss some 536b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* characters. To choose a right shift size, we estimate the NFC form of the 537b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* and use its size as a shift guide. The NFC form should be the small 538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* possible representation of the pattern. Anyways, we'll err on the smaller 539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shift size. Hence the calculation for minlength. 540b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Canonical match will be performed slightly differently. We'll split the 541b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* pattern into 3 parts, the prefix accents (PA), the middle string bounded by 542b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* the first and last base character (MS), the ending accents (EA). Matches 543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* will be done on MS first, and only when we match MS then some processing 544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* will be required for the prefix and end accents in order to determine if 545b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* they match PA and EA. Hence the default shift values 546b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* for the canonical match will take the size of either end's accent into 547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* consideration. Forwards search will take the end accents into consideration 548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* for the default shift values and the backwards search will take the prefix 549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* accents into consideration. 550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If pattern has no non-ignorable ce, we return a illegal argument error. 551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success. 552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch UStringSearch data storage 553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status for output errors if it occurs, status is assumed to be a 554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* success when it is passed in. 555b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*/ 556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 557b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void initialize(UStringSearch *strsrch, UErrorCode *status) 558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 559b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int16_t expandlength = initializePattern(strsrch, status); 560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) { 561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UPattern *pattern = &strsrch->pattern; 562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t cesize = pattern->CELength; 563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 564b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int16_t minlength = cesize > expandlength 565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ? (int16_t)cesize - expandlength : 1; 566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pattern->defaultShiftSize = minlength; 567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setShiftTable(pattern->shift, pattern->backShift, pattern->CE, 568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cesize, expandlength, minlength, minlength); 569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.defaultShiftSize = 0; 572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 574c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 576b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Check to make sure that the match length is at the end of the character by 577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* using the breakiterator. 578b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param strsrch string search data 579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start target text start offset 580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end target text end offset 581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 583b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, 584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *end) 585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->internalBreakIter; 588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiterator) { 589b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t matchend = *end; 590b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //int32_t matchstart = *start; 591b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 592b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (!ubrk_isBoundary(breakiterator, matchend)) { 593b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *end = ubrk_following(breakiterator, matchend); 594c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 595b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 596b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /* Check the start of the matched text to make sure it doesn't have any accents 597b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * before it. This code may not be necessary and so it is commented out */ 598b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { 599b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *start = ubrk_preceding(breakiterator, matchstart); 600b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho }*/ 601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 606b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Determine whether the target text in UStringSearch bounded by the offset 607b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* start and end is one or more whole units of text as 608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* determined by the breakiterator in UStringSearch. 609b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param strsrch string search data 610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start target text start offset 611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end target text end offset 612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 614b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool isBreakUnit(const UStringSearch *strsrch, int32_t start, 615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end) 616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->breakIter; 619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //TODO: Add here. 620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiterator) { 621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t startindex = ubrk_first(breakiterator); 622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t endindex = ubrk_last(breakiterator); 623b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // out-of-range indexes are never boundary positions 625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (start < startindex || start > endindex || 626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru end < startindex || end > endindex) { 627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 629b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // otherwise, we can use following() on the position before the 630b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // specified one and return true of the position we get back is the 631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // one the user specified 632b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool result = (start == startindex || 633b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ubrk_following(breakiterator, start - 1) == start) && 634b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (end == endindex || 635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ubrk_following(breakiterator, end - 1) == end); 636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result) { 637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // iterates the individual ces 638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 639b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *text = strsrch->search->text + 640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru start; 641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, text, end - start, &status); 643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int32_t count = 0; count < strsrch->pattern.CELength; 644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++) { 645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) { 651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t nextce = ucol_next(coleiter, &status); 655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ucol_getOffset(coleiter) == (end - start) 656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru && getCE(strsrch, nextce) == UCOL_IGNORABLE) { 657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextce = ucol_next(coleiter, &status); 658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) == (end - start) 660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru && nextce != UCOL_NULLORDER) { 661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // extra collation elements at the end of the match 662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 672b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Getting the next base character offset if current offset is an accent, 673b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* or the current offset if the current character contains a base character. 674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* accents the following base character will be returned 675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string 676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset current offset 677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textlength length of text string 678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return the next base character or the current offset 679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* if the current character is contains a base character. 680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 682b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getNextBaseOffset(const UChar *text, 683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength) 685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textoffset < textlength) { 687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 689b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru while (temp < textlength) { 690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = temp; 691b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((getFCD(text, &temp, textlength) >> 692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru SECOND_LAST_BYTE_SHIFT_) == 0) { 693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textlength; 697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the next base character offset depending on the string search pattern 704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* data 705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset current offset, one offset away from the last character 707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* to search for. 708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return start index of the next base character or the current offset 709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* if the current character is contains a base character. 710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 712b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, 713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset) 714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 716b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strsrch->pattern.hasSuffixAccents && 717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset < textlength) { 718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_BACK_1(text, 0, temp); 721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return getNextBaseOffset(text, textoffset, textlength); 723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Shifting the collation element iterator position forward to prepare for 730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* a following match. If the last character is a unsafe character, we'll only 731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shift by 1 to capture contractions, normalization etc. 732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success. 733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text strsrch string search data 734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset start text position to do search 735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param ce the text ce which failed the match. 736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param patternceindex index of the ce within the pattern ce buffer which 737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* failed the match 738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return final offset 739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int32_t shiftForward(UStringSearch *strsrch, 742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce, 744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex) 745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UPattern *pattern = &(strsrch->pattern); 747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_NULLORDER) { 748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t shift = pattern->shift[hash(ce)]; 749b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // this is to adjust for characters in the middle of the 750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // substring for matching that failed. 751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t adjust = pattern->CELength - patternceindex; 752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (adjust > 1 && shift >= adjust) { 753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru shift -= adjust - 1; 754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset += shift; 756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset += pattern->defaultShiftSize; 759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 760b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); 762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // check for unsafe characters 763b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // * if it is the start or middle of a contraction: to be done after 764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // a initial match is found 765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // * thai or lao base consonant character: similar to contraction 766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // * high surrogate character: similar to contraction 767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // * next character is a accent: shift to the next base character 768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 770c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif // #if BOYER_MOORE 771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 773b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* sets match not found 774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 777b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void setMatchNotFound(UStringSearch *strsrch) 778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this method resets the match result regardless of the error status. 780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = USEARCH_DONE; 781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = 0; 782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isForwardSearching) { 783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, strsrch->search->textLength); 784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, 0); 787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 790c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the offset to the next safe point in text. 793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* ie. not the middle of a contraction, swappable characters or supplementary 794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* characters. 795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param collator collation sata 796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string to work with 797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in string 798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textlength length of text string 799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return offset to the next safe character 800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 802b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getNextSafeOffset(const UCollator *collator, 803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text, 804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength) 806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = textoffset; // first contraction character 808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (result != textlength && ucol_unsafeCP(text[result], collator)) { 809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result ++; 810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 811b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return result; 812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 814b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru/** 815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This checks for accents in the potential match started with a . 816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* composite character. 817b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* This is really painful... we have to check that composite character do not 818b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* have any extra accents. We have to normalize the potential match and find 819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* the immediate decomposed character before the match. 820b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* The first composite character would have been taken care of by the fcd 821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* checks in checkForwardExactMatch. 822b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* This is the slow path after the fcd of the first character and 823b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* the last character has been checked by checkForwardExactMatch and we 824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* determine that the potential match has extra non-ignorable preceding 825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* ces. 826b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* E.g. looking for \u0301 acute in \u01FA A ring above and acute, 827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* checkExtraMatchAccent should fail since there is a middle ring in \u01FA 828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Note here that accents checking are slow and cautioned in the API docs. 829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be a success, caller should check status 830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method 831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start index of the potential unfriendly composite character 833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end index of the potential unfriendly composite character 834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any. 835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if there is non-ignorable accents before at the beginning 836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* of the match, FALSE otherwise. 837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, 841b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end, 842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool result = FALSE; 845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasPrefixAccents) { 846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t length = end - start; 847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = 0; 848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text + start; 849b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_FWD_1(text, offset, length); 851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we are only concerned with the first composite character 852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { 853b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t safeoffset = getNextSafeOffset(strsrch->collator, 854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru text, 0, length); 855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (safeoffset != length) { 856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safeoffset ++; 857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *norm = NULL; 859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar buffer[INITIAL_ARRAY_SIZE_]; 860b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, 861b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru buffer, INITIAL_ARRAY_SIZE_, 862b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status); 863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (size >= INITIAL_ARRAY_SIZE_) { 867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), 868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 869b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // if allocation failed, status will be set to 870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally 871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // checks for it. 872b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, 873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru size, status); 874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) && norm != NULL) { 875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(norm); 876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru norm = buffer; 881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, norm, size, status); 885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t firstce = strsrch->pattern.CE[0]; 886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool ignorable = TRUE; 887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t ce = UCOL_IGNORABLE; 888c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { 889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset = ucol_getOffset(coleiter); 890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != firstce && ce != UCOL_IGNORABLE) { 891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ignorable = FALSE; 892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = ucol_next(coleiter, status); 894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 codepoint; 896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_PREV_CHAR(norm, 0, offset, codepoint); 897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = !ignorable && (u_getCombiningClass(codepoint) != 0); 898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (norm != buffer) { 900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(norm); 901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 909b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Used by exact matches, checks if there are accents before the match. 910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This is really painful... we have to check that composite characters at 911b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* the start of the matches have to not have any extra accents. 912b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* We check the FCD of the character first, if it starts with an accent and 913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* the first pattern ce does not match the first ce of the character, we bail. 914b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Otherwise we try normalizing the first composite 915b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* character and find the immediate decomposed character before the match to 916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* see if it is an non-ignorable accent. 917b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Now normalizing the first composite character is enough because we ensure 918b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* that when the match is passed in here with extra beginning ces, the 919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* first or last ce that match has to occur within the first character. 920b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* E.g. looking for \u0301 acute in \u01FA A ring above and acute, 921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* checkExtraMatchAccent should fail since there is a middle ring in \u01FA 922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Note here that accents checking are slow and cautioned in the API docs. 923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 924b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param start offset 925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset 926b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return TRUE if there are accents on either side of the match, 927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* FALSE otherwise 928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, 931b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end) 932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasPrefixAccents) { 934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we have been iterating forwards previously 937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t ignorable = TRUE; 938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t firstce = strsrch->pattern.CE[0]; 939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, start); 941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ce != firstce) { 946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_IGNORABLE) { 947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ignorable = FALSE; 948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_next(coleiter, &status)); 950c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!ignorable && inNormBuf(coleiter)) { 955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // within normalization buffer, discontiguous handled here 956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // within text 960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = start; 961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // original code 962b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // accent = (getFCD(strsrch->search->text, &temp, 963b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // strsrch->search->textLength) 964b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // >> SECOND_LAST_BYTE_SHIFT_); 965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // however this code does not work well with VC7 .net in release mode. 966b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // maybe the inlines for getFCD combined with shifting has bugs in 967b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // VC7. anyways this is a work around. 968b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool accent = getFCD(strsrch->search->text, &temp, 969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength) > 0xFF; 970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!accent) { 971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return checkExtraMatchAccents(strsrch, start, end, &status); 972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!ignorable) { 974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (start > 0) { 977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = start; 978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_BACK_1(strsrch->search->text, 0, temp); 979b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (getFCD(strsrch->search->text, &temp, 980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength) & LAST_BYTE_MASK_) { 981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, start); 982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = ucol_previous(coleiter, &status); 983b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status) || 984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { 985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 990b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Used by exact matches, checks if there are accents bounding the match. 996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Note this is the initial boundary check. If the potential match 997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* starts or ends with composite characters, the accents in those 998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* characters will be determined later. 999b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Not doing backwards iteration here, since discontiguos contraction for 1000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* backwards collation element iterator, use up too many characters. 1001b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* E.g. looking for \u030A ring in \u01FA A ring above and acute, 1002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* should fail since there is a acute at the end of \u01FA 1003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Note here that accents checking are slow and cautioned in the API docs. 1004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of match 1006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end end offset of the match 1007b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return TRUE if there are accents on either side of the match, 1008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* FALSE otherwise 1009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1011b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, 1012b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end) 1013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasSuffixAccents) { 1015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = end; 1017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 1018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_BACK_1(text, 0, temp); 1019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 1020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t firstce = strsrch->pattern.CE[0]; 1021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 1023b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t ce; 1024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, start); 1025c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { 1026c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 1027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = 1; 1031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (count < strsrch->pattern.CELength) { 1032b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (getCE(strsrch, ucol_next(coleiter, &status)) 1033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru == UCOL_IGNORABLE) { 1034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Thai can give an ignorable here. 1035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 1036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++; 1041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1042b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1043b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ce = ucol_next(coleiter, &status); 1044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1048b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ce = getCE(strsrch, ce); 1049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) <= end) { 1052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 1055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1062c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif // #if BOYER_MOORE 1063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks if the offset runs out of the text string 1066b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param offset 1067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textlength of the text string 1068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if offset is out of bounds, FALSE otherwise 1069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline UBool isOutOfBounds(int32_t textlength, int32_t offset) 1072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return offset < 0 || offset > textlength; 1074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks for identical match 1078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of possible match 1080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of possible match 1081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if identical match is found 1082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1084b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, 1085b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end) 1086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->strength != UCOL_IDENTICAL) { 1088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 109150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Note: We could use Normalizer::compare() or similar, but for short strings 109250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // which may not be in FCD it might be faster to just NFD them. 109350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode status = U_ZERO_ERROR; 109450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString t2, p2; 109550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->nfd->normalize( 109650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); 109750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->nfd->normalize( 109850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); 1099b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // return FALSE if NFD failed 110050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_SUCCESS(status) && t2 == p2; 1101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1103b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if BOYER_MOORE 1104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks to see if the match is repeated 1106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start new match start index 1108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end new match end index 1109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the the match is repeated, FALSE otherwise 1110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline UBool checkRepeatedMatch(UStringSearch *strsrch, 1113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start, 1114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end) 1115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastmatchindex = strsrch->search->matchedIndex; 1117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool result; 1118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (lastmatchindex == USEARCH_DONE) { 1119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isForwardSearching) { 1122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = start <= lastmatchindex; 1123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = start >= lastmatchindex; 1126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!result && !strsrch->search->isOverlap) { 1128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isForwardSearching) { 1129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = start < lastmatchindex + strsrch->search->matchedLength; 1130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = end > lastmatchindex; 1133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the collation element iterator's current offset. 1140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator 1141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param forwards flag TRUE if we are moving in th forwards direction 1142b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return current offset 1143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int32_t getColElemIterOffset(const UCollationElements *coleiter, 1146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool forwards) 1147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = ucol_getOffset(coleiter); 1149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // intricacies of the the backwards collation element iterator 1150c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { 1151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result ++; 1152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1157b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks match for contraction. 1158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match ends with a partial contraction we fail. 1159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match starts too far off (because of backwards iteration) we try to 1160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* chip off the extra characters depending on whether a breakiterator has 1161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* been used. 1162b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, error assumed to be success, caller has to check status 1163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of potential match, to be modified if necessary 1166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of potential match, to be modified if necessary 1167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if match passes the contraction test, FALSE otherwise 1169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1172b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool checkNextExactContractionMatch(UStringSearch *strsrch, 1173b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *start, 1174b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *end, UErrorCode *status) 1175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 1178c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t temp = *start; 1179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 1180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1181b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // This part checks if either ends of the match contains potential 1182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // contraction. If so we'll have to iterate through them 1183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The start contraction needs to be checked since ucol_previous dumps 1184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // all characters till the first safe character into the buffer. 1185b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // *start + 1 is used to test for the unsafe characters instead of *start 1186b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // because ucol_prev takes all unsafe characters till the first safe 1187b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // character ie *start. so by testing *start + 1, we can estimate if 1188b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // excess prefix characters has been included in the potential search 1189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // results. 1190b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1191b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (*start + 1 < textlength 1192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru && ucol_unsafeCP(text[*start + 1], collator))) { 1193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expansion = getExpansionPrefix(coleiter); 1194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool expandflag = expansion > 0; 1195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, *start); 1196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (expansion > 0) { 1197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // getting rid of the redundant ce, caused by setOffset. 1198b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // since backward contraction/expansion may have extra ces if we 1199b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // are in the normalization buffer, hasAccentsBeforeMatch would 1200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // have taken care of it. 1201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // E.g. the character \u01FA will have an expansion of 3, but if 1202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we are only looking for acute and ring \u030A and \u0301, we'll 1203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // have to skip the first ce in the expansion buffer. 1204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_next(coleiter, status); 1205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) != temp) { 1209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = temp; 1210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 1211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expansion --; 1213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *patternce = strsrch->pattern.CE; 1216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patterncelength = strsrch->pattern.CELength; 1217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = 0; 1218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (count < patterncelength) { 1219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 1221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 1222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = temp; 1225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 1226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || ce != patternce[count]) { 1228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*end) ++; 1229b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *end = getNextUStringSearchBaseOffset(strsrch, *end); 1230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++; 1233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1234b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 1235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks and sets the match information if found. 1240b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks 1241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <ul> 1242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> the potential match does not repeat the previous match 1243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> boundaries are correct 1244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> exact matches has no extra accents 1245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> identical matchesb 1246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> potential match does not end in the middle of a contraction 1247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <\ul> 1248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Otherwise the offset will be shifted to the next character. 1249b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in the collation element text. the returned value 1253b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* will be the truncated end offset of the match or the new start 1254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* search offset. 1255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 1257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1259b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkNextExactMatch(UStringSearch *strsrch, 1260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *textoffset, UErrorCode *status) 1261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1263b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t start = getColElemIterOffset(coleiter, FALSE); 1264b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) { 1266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this totally matches, however we need to check if it is repeating 1270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!isBreakUnit(strsrch, start, *textoffset) || 1271b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru checkRepeatedMatch(strsrch, start, *textoffset) || 1272b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru hasAccentsBeforeMatch(strsrch, start, *textoffset) || 1273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru !checkIdentical(strsrch, start, *textoffset) || 1274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru hasAccentsAfterMatch(strsrch, start, *textoffset)) { 1275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*textoffset) ++; 1277b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset); 1278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Add breakiterator boundary check for primary strength search. 1282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 1283b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho checkBreakBoundary(strsrch, &start, textoffset); 1284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1285b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // totally match, we will get rid of the ending ignorables. 1287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = start; 1288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = *textoffset - start; 1289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1293b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Getting the previous base character offset, or the current offset if the 1294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* current character is a base character 1295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string 1296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset one offset after the current character 1297b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return the offset of the next character after the base character or the first 1298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* composed character with accents 1299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1301b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getPreviousBaseOffset(const UChar *text, 1302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset) 1303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textoffset > 0) { 1305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 1306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = textoffset; 1307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_BACK_1(text, 0, textoffset); 1308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 1309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint16_t fcd = getFCD(text, &temp, result); 1310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { 1311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fcd & LAST_BYTE_MASK_) { 1312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 1313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textoffset == 0) { 1317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 1318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 1322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Getting the indexes of the accents that are not blocked in the argument 1326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* accent array 1327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param accents array of accents in nfd terminated by a 0. 1328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param accentsindex array of indexes of the accents that are not blocked 1329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex) 1332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t index = 0; 1334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t length = u_strlen(accents); 1335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 codepoint = 0; 1336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int cclass = 0; 1337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int result = 0; 1338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp; 1339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (index < length) { 1340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = index; 1341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_NEXT_CHAR(accents, index, length, codepoint); 1342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (u_getCombiningClass(codepoint) != cclass) { 1343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cclass = u_getCombiningClass(codepoint); 1344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru accentsindex[result] = temp; 1345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result ++; 1346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru accentsindex[result] = length; 1349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Appends 3 UChar arrays to a destination array. 1354b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Creates a new array if we run out of space. The caller will have to 1355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* manually deallocate the newly allocated array. 1356b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1357b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* before calling this method. destination not to be NULL and has at least 1358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* size destinationlength. 1359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param destination target array 1360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param destinationlength target array size, returning the appended length 1361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param source1 null-terminated first array 1362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param source2 second array 1363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param source2length length of seond array 1364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param source3 null-terminated third array 1365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any 1366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return new destination array, destination if there was no new allocation 1367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1369b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UChar * addToUCharArray( UChar *destination, 1370b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *destinationlength, 1371b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *source1, 1372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *source2, 1373b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t source2length, 1374b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *source3, 1375b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 1376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t source1length = source1 ? u_strlen(source1) : 0; 1378b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t source3length = source3 ? u_strlen(source3) : 0; 1379b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (*destinationlength < source1length + source2length + source3length + 1380b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1) 1381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru destination = (UChar *)allocateMemory( 1383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (source1length + source2length + source3length + 1) * sizeof(UChar), 1384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1385b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // if error allocating memory, status will be 1386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // U_MEMORY_ALLOCATION_ERROR 1387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *destinationlength = 0; 1389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 1390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (source1length != 0) { 1393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memcpy(destination, source1, sizeof(UChar) * source1length); 1394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (source2length != 0) { 1396b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uprv_memcpy(destination + source1length, source2, 1397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(UChar) * source2length); 1398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (source3length != 0) { 1400b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uprv_memcpy(destination + source1length + source2length, source3, 1401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(UChar) * source3length); 1402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *destinationlength = source1length + source2length + source3length; 1404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return destination; 1405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Running through a collation element iterator to see if the contents matches 1409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* pattern in string search data 1410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator 1412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if a match if found, FALSE otherwise 1413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1415b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkCollationMatch(const UStringSearch *strsrch, 1416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter) 1417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int patternceindex = strsrch->pattern.CELength; 1419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *patternce = strsrch->pattern.CE; 1420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 1421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (patternceindex > 0) { 1422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 1424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 1425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status) || ce != *patternce) { 1427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternce ++; 1430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex --; 1431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Rearranges the front accents to try matching. 1437b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Prefix accents in the text will be grouped according to their combining 1438b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* class and the groups will be mixed and matched to try find the perfect 1439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* match with the pattern. 1440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1442b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* "\u0301\u0325". 1444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 2: check if any of the generated substrings matches the pattern. 1445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status is assumed to be success, caller has to check status 1446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search match 1448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start first offset of the accents to start searching 1449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end start of the last accent set 1450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return USEARCH_DONE if a match is not found, otherwise return the starting 1452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offset of the match. Note this start includes all preceding accents. 1453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1455b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruint32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch, 1456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start, 1457b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end, 1458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 1459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 1462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tempstart = start; 1463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) { 1465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // die... failed at a base character 1466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = getNextBaseOffset(text, tempstart, textlength); 1470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru start = getPreviousBaseOffset(text, tempstart); 1471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar accents[INITIAL_ARRAY_SIZE_]; 1473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalizing the offensive string 1474b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents, 1475b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru INITIAL_ARRAY_SIZE_, status); 1476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1479b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1480b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1481b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t accentsize = getUnblockedAccentIndex(accents, 1482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru accentsindex); 1483b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t count = (2 << (accentsize - 1)) - 1; 1484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar buffer[INITIAL_ARRAY_SIZE_]; 1485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 1486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && count > 0) { 1487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *rearrange = strsrch->canonicalPrefixAccents; 1488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // copy the base characters 1489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int k = 0; k < accentsindex[0]; k ++) { 1490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[k]; 1491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forming all possible canonical rearrangement by dropping 1493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets of accents 1494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i <= accentsize - 1; i ++) { 1495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t mask = 1 << (accentsize - i - 1); 1496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count & mask) { 1497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[j]; 1499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange = 0; 1503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t matchsize = INITIAL_ARRAY_SIZE_; 1504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *match = addToUCharArray(buffer, &matchsize, 1505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents, 1506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->text + offset, 1507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru end - offset, 1508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents, 1509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1510b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setText does nothing. 1512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // run the collator iterator through this match 1513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, match, matchsize, status); 1514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 1515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkCollationMatch(strsrch, coleiter)) { 1516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match != buffer) { 1517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(match); 1518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return start; 1520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 1523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the offset to the safe point in text before textoffset. 1529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* ie. not the middle of a contraction, swappable characters or supplementary 1530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* characters. 1531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param collator collation sata 1532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string to work with 1533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in string 1534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textlength length of text string 1535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return offset to the previous safe character 1536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1538b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline uint32_t getPreviousSafeOffset(const UCollator *collator, 1539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text, 1540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset) 1541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = textoffset; // first contraction character 1543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) { 1544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result --; 1545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result != 0) { 1547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the first contraction character is consider unsafe here 1548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result --; 1549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1550b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return result; 1551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Cleaning up after we passed the safe zone 1555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param safetext safe text array 1557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param safebuffer safe text buffer 1558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator for safe text 1559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext, 1562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *safebuffer) 1563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1564b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents) 1565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(safetext); 1567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Take the rearranged end accents and tries matching. If match failed at 1572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* a seperate preceding set of accents (seperated from the rearranged on by 1573b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* at least a base character) then we rearrange the preceding accents and 1574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* tries matching again. 1575b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* We allow skipping of the ends of the accent set if the ces do not match. 1576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* However if the failure is found before the accent set, it fails. 1577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset of the start of the rearranged accent 1581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return USEARCH_DONE if a match is not found, otherwise return the starting 1583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offset of the match. Note this start includes all preceding accents. 1584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1586b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruint32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, 1587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 1588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 1589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 1592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safelength = 0; 1593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *safetext; 1594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safetextlength; 1595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar safebuffer[INITIAL_ARRAY_SIZE_]; 1596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 1597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safeoffset = textoffset; 1598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1599b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0], 1600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru collator)) { 1601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safeoffset = getPreviousSafeOffset(collator, text, textoffset); 1602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safelength = textoffset - safeoffset; 1603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetextlength = INITIAL_ARRAY_SIZE_; 1604b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru safetext = addToUCharArray(safebuffer, &safetextlength, NULL, 1605b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru text + safeoffset, safelength, 1606b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->canonicalSuffixAccents, 1607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetextlength = u_strlen(strsrch->canonicalSuffixAccents); 1611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetext = strsrch->canonicalSuffixAccents; 1612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setText does nothing 1615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, safetext, safetextlength, status); 1616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in loop below 1617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *ce = strsrch->pattern.CE; 1619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t celength = strsrch->pattern.CELength; 1620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int ceindex = celength - 1; 1621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool isSafe = TRUE; // indication flag for position in safe zone 1622b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ceindex >= 0) { 1624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textce = ucol_previous(coleiter, status); 1625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 1627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce == UCOL_NULLORDER) { 1632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // check if we have passed the safe buffer 1633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (coleiter == strsrch->textIter) { 1634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetext = safebuffer; 1639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru coleiter = strsrch->textIter; 1640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, safeoffset); 1641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked at the start of the loop 1642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru isSafe = FALSE; 1643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 1644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textce = getCE(strsrch, textce); 1646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 1647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // do the beginning stuff 1648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t failedoffset = getColElemIterOffset(coleiter, FALSE); 1649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe && failedoffset >= safelength) { 1650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // alas... no hope. failed at rearranged accent set 1651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 1656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru failedoffset += safeoffset; 1657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1659b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // try rearranging the front accents 1661b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t result = doNextCanonicalPrefixMatch(strsrch, 1662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru failedoffset, textoffset, status); 1663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result != USEARCH_DONE) { 1664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setOffset does nothing 1665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, result); 1666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce == ce[ceindex]) { 1674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ceindex --; 1675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // set offset here 1678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 1679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = getColElemIterOffset(coleiter, FALSE); 1680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets the text iterator here with the correct expansion and offset 1681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t leftoverces = getExpansionPrefix(coleiter); 1682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1683b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (result >= safelength) { 1684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = textoffset; 1685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result += safeoffset; 1688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, result); 1690b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->textIter->iteratordata_.toReturn = 1691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setExpansionPrefix(strsrch->textIter, leftoverces); 1692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1694b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1695b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return ucol_getOffset(coleiter); 1696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Trying out the substring and sees if it can be a canonical match. 1700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This will try normalizing the end accents and arranging them into canonical 1701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* equivalents and check their corresponding ces with the pattern ce. 1702b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Suffix accents in the text will be grouped according to their combining 1703b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* class and the groups will be mixed and matched to try find the perfect 1704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* match with the pattern. 1705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1707b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* "\u0301\u0325". 1709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 2: check if any of the generated substrings matches the pattern. 1710b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1713b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param textoffset end offset in the collation element text that ends with 1714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* the accents to be rearranged 1715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any 1716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 1717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1719b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool doNextCanonicalMatch(UStringSearch *strsrch, 1720b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t textoffset, 1721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 1722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 1725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_BACK_1(text, 0, temp); 1726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { 1727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = getColElemIterOffset(coleiter, FALSE); 1729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasPrefixAccents) { 1730b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset, 1731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 1733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, offset); 1734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!strsrch->pattern.hasSuffixAccents) { 1741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar accents[INITIAL_ARRAY_SIZE_]; 1745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // offset to the last base character in substring to search 1746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t baseoffset = getPreviousBaseOffset(text, textoffset); 1747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalizing the offensive string 1748b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD, 1749b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0, accents, INITIAL_ARRAY_SIZE_, status); 1750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in loop below 1751b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t size = getUnblockedAccentIndex(accents, accentsindex); 1754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2 power n - 1 plus the full set of accents 1756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = (2 << (size - 1)) - 1; 1757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && count > 0) { 1758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *rearrange = strsrch->canonicalSuffixAccents; 1759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // copy the base characters 1760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int k = 0; k < accentsindex[0]; k ++) { 1761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[k]; 1762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forming all possible canonical rearrangement by dropping 1764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets of accents 1765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i <= size - 1; i ++) { 1766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t mask = 1 << (size - i - 1); 1767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count & mask) { 1768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[j]; 1770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange = 0; 1774b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset, 1775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset != USEARCH_DONE) { 1777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; // match found 1778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 1780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1785b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Gets the previous base character offset depending on the string search 1786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* pattern data 1787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset current offset, current character 1789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return the offset of the next character after this base character or itself 1790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* if it is a composed character with accents 1791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1793b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch, 1794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset) 1795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasPrefixAccents && textoffset > 0) { 1797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = textoffset; 1799b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (getFCD(text, &offset, strsrch->search->textLength) >> 1800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru SECOND_LAST_BYTE_SHIFT_) { 1801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return getPreviousBaseOffset(text, textoffset); 1802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 1805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1808b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks match for contraction. 1809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match ends with a partial contraction we fail. 1810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match starts too far off (because of backwards iteration) we try to 1811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* chip off the extra characters 1812b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of potential match, to be modified if necessary 1816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of potential match, to be modified if necessary 1817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if match passes the contraction test, FALSE otherwise 1819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1821b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, 1822b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *start, 1823b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *end, 1824b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 1825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 1828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = *start; 1829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 1830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1831b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // This part checks if either ends of the match contains potential 1832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // contraction. If so we'll have to iterate through them 1833b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1834b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (*start + 1 < textlength 1835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru && ucol_unsafeCP(text[*start + 1], collator))) { 1836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expansion = getExpansionPrefix(coleiter); 1837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool expandflag = expansion > 0; 1838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, *start); 1839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (expansion > 0) { 1840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // getting rid of the redundant ce, caused by setOffset. 1841b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // since backward contraction/expansion may have extra ces if we 1842b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // are in the normalization buffer, hasAccentsBeforeMatch would 1843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // have taken care of it. 1844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // E.g. the character \u01FA will have an expansion of 3, but if 1845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we are only looking for acute and ring \u030A and \u0301, we'll 1846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // have to skip the first ce in the expansion buffer. 1847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_next(coleiter, status); 1848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) != temp) { 1852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = temp; 1853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 1854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expansion --; 1856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *patternce = strsrch->pattern.CE; 1859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patterncelength = strsrch->pattern.CELength; 1860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = 0; 1861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 1862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (count < patterncelength) { 1863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below, note that if status is a failure 1865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ucol_next returns UCOL_NULLORDER 1866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 1867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 1868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = temp; 1871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 1872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count == 0 && ce != patternce[0]) { 1875b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // accents may have extra starting ces, this occurs when a 1876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pure accent pattern is matched without rearrangement 1877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // text \u0325\u0300 and looking for \u0300 1878b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t expected = patternce[0]; 1879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { 1880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_next(coleiter, status)); 1881b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru while (U_SUCCESS(*status) && ce != expected && 1882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce != UCOL_NULLORDER && 1883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_getOffset(coleiter) <= *end) { 1884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_next(coleiter, status)); 1885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || ce != patternce[count]) { 1889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*end) ++; 1890b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *end = getNextUStringSearchBaseOffset(strsrch, *end); 1891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++; 1894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1895b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 1896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks and sets the match information if found. 1901b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks 1902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <ul> 1903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> the potential match does not repeat the previous match 1904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> boundaries are correct 1905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> potential match does not end in the middle of a contraction 1906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> identical matches 1907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <\ul> 1908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Otherwise the offset will be shifted to the next character. 1909b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check the 1910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* status before calling this method. 1911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in the collation element text. the returned value 1913b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* will be the truncated end offset of the match or the new start 1914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* search offset. 1915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 1917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1919b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkNextCanonicalMatch(UStringSearch *strsrch, 1920b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *textoffset, 1921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 1922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to ensure that the start and ends are not composite characters 1924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we have a canonical accent match 1926b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((strsrch->pattern.hasSuffixAccents && 1927b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->canonicalSuffixAccents[0]) || 1928b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (strsrch->pattern.hasPrefixAccents && 1929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0])) { 1930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset( 1931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch, 1932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_getOffset(coleiter)); 1933b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->matchedLength = *textoffset - 1934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex; 1935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start = getColElemIterOffset(coleiter, FALSE); 1939b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset, 1940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status) || U_FAILURE(*status)) { 1941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1943b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru start = getPreviousUStringSearchBaseOffset(strsrch, start); 1945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this totally matches, however we need to check if it is repeating 1946b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (checkRepeatedMatch(strsrch, start, *textoffset) || 1947b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru !isBreakUnit(strsrch, start, *textoffset) || 1948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru !checkIdentical(strsrch, start, *textoffset)) { 1949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*textoffset) ++; 1950b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset, 1951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength); 1952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1954b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = start; 1956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = *textoffset - start; 1957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Shifting the collation element iterator position forward to prepare for 1962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* a preceding match. If the first character is a unsafe character, we'll only 1963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shift by 1 to capture contractions, normalization etc. 1964b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text strsrch string search data 1967b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset start text position to do search 1968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param ce the text ce which failed the match. 1969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param patternceindex index of the ce within the pattern ce buffer which 1970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* failed the match 1971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return final offset 1972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int32_t reverseShift(UStringSearch *strsrch, 1975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 1976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce, 1977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex) 1978b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{ 1979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isOverlap) { 1980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textoffset != strsrch->search->textLength) { 1981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset --; 1982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset -= strsrch->pattern.defaultShiftSize; 1985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_NULLORDER) { 1989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t shift = strsrch->pattern.backShift[hash(ce)]; 1990b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1991b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // this is to adjust for characters in the middle of the substring 1992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for matching that failed. 1993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t adjust = patternceindex; 1994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (adjust > 1 && shift > adjust) { 1995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru shift -= adjust - 1; 1996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset -= shift; 1998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset -= strsrch->pattern.defaultShiftSize; 2001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2002b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 2003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset); 2004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 2005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2008b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks match for contraction. 2009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match starts with a partial contraction we fail. 2010b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of potential match, to be modified if necessary 2014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of potential match, to be modified if necessary 2015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 2016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if match passes the contraction test, FALSE otherwise 2017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2019b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool checkPreviousExactContractionMatch(UStringSearch *strsrch, 2020b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *start, 2021b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *end, UErrorCode *status) 2022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 2024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 2025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = *end; 2026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 2027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2028b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // This part checks if either if the start of the match contains potential 2029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // contraction. If so we'll have to iterate through them 2030b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Since we used ucol_next while previously looking for the potential 2031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // match, this guarantees that our end will not be a partial contraction, 2032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // or a partial supplementary character. 2033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expansion = getExpansionSuffix(coleiter); 2035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool expandflag = expansion > 0; 2036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, *end); 2037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && expansion > 0) { 2038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // getting rid of the redundant ce 2039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // since forward contraction/expansion may have extra ces 2040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we are in the normalization buffer, hasAccentsBeforeMatch 2041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // would have taken care of it. 2042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // E.g. the character \u01FA will have an expansion of 3, but if 2043b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // we are only looking for A ring A\u030A, we'll have to skip the 2044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // last ce in the expansion buffer 2045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_previous(coleiter, status); 2046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2048b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) != temp) { 2050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *end = temp; 2051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 2052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expansion --; 2054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *patternce = strsrch->pattern.CE; 2057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patterncelength = strsrch->pattern.CELength; 2058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = patterncelength; 2059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (count > 0) { 2060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below, note that if status is a failure 2062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ucol_previous returns UCOL_NULLORDER 2063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 2064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2066b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (expandflag && count == 0 && 2067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru getColElemIterOffset(coleiter, FALSE) != temp) { 2068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *end = temp; 2069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 2070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*start) --; 2073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = getPreviousBaseOffset(text, *start); 2074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 2077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2078b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 2079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks and sets the match information if found. 2084b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks 2085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <ul> 2086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> the current match does not repeat the last match 2087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> boundaries are correct 2088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> exact matches has no extra accents 2089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> identical matches 2090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <\ul> 2091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Otherwise the offset will be shifted to the preceding character. 2092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2094b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2095b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param collator 2096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator 2097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string 2098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in the collation element text. the returned value 2099b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* will be the truncated start offset of the match or the new start 2100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* search offset. 2101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 2102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 2103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2105b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkPreviousExactMatch(UStringSearch *strsrch, 2106b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *textoffset, 2107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to ensure that the start and ends are not composite characters 2110b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end = ucol_getOffset(strsrch->textIter); 2111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status) 2112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru || U_FAILURE(*status)) { 2113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2115b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this totally matches, however we need to check if it is repeating 2117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the old match 2118b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (checkRepeatedMatch(strsrch, *textoffset, end) || 2119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru !isBreakUnit(strsrch, *textoffset, end) || 2120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru hasAccentsBeforeMatch(strsrch, *textoffset, end) || 2121b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru !checkIdentical(strsrch, *textoffset, end) || 2122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru hasAccentsAfterMatch(strsrch, *textoffset, end)) { 2123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*textoffset) --; 2124b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *textoffset = getPreviousBaseOffset(strsrch->search->text, 2125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *textoffset); 2126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2128b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Add breakiterator boundary check for primary strength search. 2130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 2131b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho checkBreakBoundary(strsrch, textoffset, &end); 2132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2133b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = *textoffset; 2135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = end - *textoffset; 2136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Rearranges the end accents to try matching. 2141b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Suffix accents in the text will be grouped according to their combining 2142b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* class and the groups will be mixed and matched to try find the perfect 2143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* match with the pattern. 2144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2146b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* "\u0301\u0325". 2148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 2: check if any of the generated substrings matches the pattern. 2149b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, user has to check status 2150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search match 2152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of the first base character 2153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end start of the last accent set 2154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status only error status if any 2155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return USEARCH_DONE if a match is not found, otherwise return the ending 2156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offset of the match. Note this start includes all following accents. 2157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2159b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruint32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch, 2160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start, 2161b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end, 2162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tempend = end; 2166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_BACK_1(text, 0, tempend); 2168b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (!(getFCD(text, &tempend, strsrch->search->textLength) & 2169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru LAST_BYTE_MASK_)) { 2170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // die... failed at a base character 2171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru end = getNextBaseOffset(text, end, strsrch->search->textLength); 2174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 2176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar accents[INITIAL_ARRAY_SIZE_]; 2177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = getPreviousBaseOffset(text, end); 2178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalizing the offensive string 2179b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents, 2180b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru INITIAL_ARRAY_SIZE_, status); 2181b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2182b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2183b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t accentsize = getUnblockedAccentIndex(accents, 2184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru accentsindex); 2185b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t count = (2 << (accentsize - 1)) - 1; 2186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar buffer[INITIAL_ARRAY_SIZE_]; 2187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 2188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && count > 0) { 2189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *rearrange = strsrch->canonicalSuffixAccents; 2190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // copy the base characters 2191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int k = 0; k < accentsindex[0]; k ++) { 2192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[k]; 2193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forming all possible canonical rearrangement by dropping 2195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets of accents 2196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i <= accentsize - 1; i ++) { 2197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t mask = 1 << (accentsize - i - 1); 2198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count & mask) { 2199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[j]; 2201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange = 0; 2205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t matchsize = INITIAL_ARRAY_SIZE_; 2206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *match = addToUCharArray(buffer, &matchsize, 2207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents, 2208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->text + start, 2209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset - start, 2210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents, 2211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 2212b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // run the collator iterator through this match 2214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure ucol_setText does nothing 2215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, match, matchsize, status); 2216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 2217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkCollationMatch(strsrch, coleiter)) { 2218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match != buffer) { 2219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(match); 2220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return end; 2222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 2225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Take the rearranged start accents and tries matching. If match failed at 2232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* a seperate following set of accents (seperated from the rearranged on by 2233b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* at least a base character) then we rearrange the preceding accents and 2234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* tries matching again. 2235b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* We allow skipping of the ends of the accent set if the ces do not match. 2236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* However if the failure is found before the accent set, it fails. 2237b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset of the ends of the rearranged accent 2241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 2242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return USEARCH_DONE if a match is not found, otherwise return the ending 2243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offset of the match. Note this start includes all following accents. 2244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2246b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruint32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, 2247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 2248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 2252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safelength = 0; 2253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *safetext; 2254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safetextlength; 2255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar safebuffer[INITIAL_ARRAY_SIZE_]; 2256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safeoffset = textoffset; 2257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2258b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (textoffset && 2259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_unsafeCP(strsrch->canonicalPrefixAccents[ 2260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_strlen(strsrch->canonicalPrefixAccents) - 1 2261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ], collator)) { 2262b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru safeoffset = getNextSafeOffset(collator, text, textoffset, 2263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength); 2264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safelength = safeoffset - textoffset; 2265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetextlength = INITIAL_ARRAY_SIZE_; 2266b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru safetext = addToUCharArray(safebuffer, &safetextlength, 2267b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->canonicalPrefixAccents, 2268b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru text + textoffset, safelength, 2269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru NULL, status); 2270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetextlength = u_strlen(strsrch->canonicalPrefixAccents); 2273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetext = strsrch->canonicalPrefixAccents; 2274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 2277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setText does nothing 2278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, safetext, safetextlength, status); 2279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in loop below 2280b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *ce = strsrch->pattern.CE; 2282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t celength = strsrch->pattern.CELength; 2283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int ceindex = 0; 2284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool isSafe = TRUE; // safe zone indication flag for position 2285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); 2286b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ceindex < celength) { 2288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textce = ucol_next(coleiter, status); 2289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 2291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce == UCOL_NULLORDER) { 2296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // check if we have passed the safe buffer 2297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (coleiter == strsrch->textIter) { 2298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetext = safebuffer; 2303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru coleiter = strsrch->textIter; 2304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, safeoffset); 2305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked at the start of the loop 2306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru isSafe = FALSE; 2307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textce = getCE(strsrch, textce); 2310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 2311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // do the beginning stuff 2312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t failedoffset = ucol_getOffset(coleiter); 2313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe && failedoffset <= prefixlength) { 2314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // alas... no hope. failed at rearranged accent set 2315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 2320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru failedoffset = safeoffset - failedoffset; 2321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2323b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // try rearranging the end accents 2325b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t result = doPreviousCanonicalSuffixMatch(strsrch, 2326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset, failedoffset, status); 2327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result != USEARCH_DONE) { 2328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setOffset does nothing 2329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, result); 2330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce == ce[ceindex]) { 2338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ceindex ++; 2339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // set offset here 2342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 2343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = ucol_getOffset(coleiter); 2344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets the text iterator here with the correct expansion and offset 2345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t leftoverces = getExpansionSuffix(coleiter); 2346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2347b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (result <= prefixlength) { 2348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = textoffset; 2349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = textoffset + (safeoffset - result); 2352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, result); 2354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setExpansionSuffix(strsrch->textIter, leftoverces); 2355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2357b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2358b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return ucol_getOffset(coleiter); 2359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Trying out the substring and sees if it can be a canonical match. 2363b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* This will try normalizing the starting accents and arranging them into 2364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* canonical equivalents and check their corresponding ces with the pattern ce. 2365b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Prefix accents in the text will be grouped according to their combining 2366b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* class and the groups will be mixed and matched to try find the perfect 2367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* match with the pattern. 2368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2370b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* "\u0301\u0325". 2372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 2: check if any of the generated substrings matches the pattern. 2373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2376b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param textoffset start offset in the collation element text that starts 2377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* with the accents to be rearranged 2378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 2379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 2380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2382b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool doPreviousCanonicalMatch(UStringSearch *strsrch, 2383b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t textoffset, 2384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 2388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 2389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { 2390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 2391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = ucol_getOffset(coleiter); 2392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasSuffixAccents) { 2393b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset, 2394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset, status); 2395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 2396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, offset); 2397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!strsrch->pattern.hasPrefixAccents) { 2404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar accents[INITIAL_ARRAY_SIZE_]; 2408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // offset to the last base character in substring to search 2409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength); 2410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalizing the offensive string 2411b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD, 2412b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0, accents, INITIAL_ARRAY_SIZE_, status); 2413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in loop 2414b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t size = getUnblockedAccentIndex(accents, accentsindex); 2417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2 power n - 1 plus the full set of accents 2419b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t count = (2 << (size - 1)) - 1; 2420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && count > 0) { 2421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *rearrange = strsrch->canonicalPrefixAccents; 2422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // copy the base characters 2423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int k = 0; k < accentsindex[0]; k ++) { 2424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[k]; 2425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forming all possible canonical rearrangement by dropping 2427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets of accents 2428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i <= size - 1; i ++) { 2429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t mask = 1 << (size - i - 1); 2430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count & mask) { 2431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[j]; 2433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange = 0; 2437b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t offset = doPreviousCanonicalPrefixMatch(strsrch, 2438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru baseoffset, status); 2439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset != USEARCH_DONE) { 2440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; // match found 2441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 2443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2448b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks match for contraction. 2449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match starts with a partial contraction we fail. 2450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of potential match, to be modified if necessary 2454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of potential match, to be modified if necessary 2455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status only error status if any 2456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if match passes the contraction test, FALSE otherwise 2457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2459b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, 2460b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *start, 2461b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *end, UErrorCode *status) 2462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 2464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 2465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = *end; 2466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 2467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2468b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // This part checks if either if the start of the match contains potential 2469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // contraction. If so we'll have to iterate through them 2470b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Since we used ucol_next while previously looking for the potential 2471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // match, this guarantees that our end will not be a partial contraction, 2472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // or a partial supplementary character. 2473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expansion = getExpansionSuffix(coleiter); 2475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool expandflag = expansion > 0; 2476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, *end); 2477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (expansion > 0) { 2478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // getting rid of the redundant ce 2479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // since forward contraction/expansion may have extra ces 2480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we are in the normalization buffer, hasAccentsBeforeMatch 2481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // would have taken care of it. 2482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // E.g. the character \u01FA will have an expansion of 3, but if 2483b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // we are only looking for A ring A\u030A, we'll have to skip the 2484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // last ce in the expansion buffer 2485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_previous(coleiter, status); 2486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) != temp) { 2490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *end = temp; 2491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 2492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expansion --; 2494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *patternce = strsrch->pattern.CE; 2497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patterncelength = strsrch->pattern.CELength; 2498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = patterncelength; 2499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (count > 0) { 2500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below, note that if status is a failure 2502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ucol_previous returns UCOL_NULLORDER 2503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 2504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2506b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (expandflag && count == 0 && 2507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru getColElemIterOffset(coleiter, FALSE) != temp) { 2508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *end = temp; 2509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 2510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2511b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (count == patterncelength && 2512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce != patternce[patterncelength - 1]) { 2513b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // accents may have extra starting ces, this occurs when a 2514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pure accent pattern is matched without rearrangement 2515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expected = patternce[patterncelength - 1]; 2516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_BACK_1(text, 0, *end); 2517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) { 2518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_previous(coleiter, status)); 2519b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru while (U_SUCCESS(*status) && ce != expected && 2520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce != UCOL_NULLORDER && 2521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_getOffset(coleiter) <= *start) { 2522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_previous(coleiter, status)); 2523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*start) --; 2528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = getPreviousBaseOffset(text, *start); 2529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 2532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2533b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 2534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks and sets the match information if found. 2539b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks 2540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <ul> 2541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> the potential match does not repeat the previous match 2542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> boundaries are correct 2543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> potential match does not end in the middle of a contraction 2544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> identical matches 2545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <\ul> 2546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Otherwise the offset will be shifted to the next character. 2547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in the collation element text. the returned value 2551b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* will be the truncated start offset of the match or the new start 2552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* search offset. 2553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status only error status if any 2554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 2555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2557b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch, 2558b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *textoffset, 2559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to ensure that the start and ends are not composite characters 2562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 2563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we have a canonical accent match 2564b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((strsrch->pattern.hasSuffixAccents && 2565b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->canonicalSuffixAccents[0]) || 2566b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (strsrch->pattern.hasPrefixAccents && 2567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0])) { 2568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = *textoffset; 2569b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->matchedLength = 2570b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru getNextUStringSearchBaseOffset(strsrch, 2571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru getColElemIterOffset(coleiter, FALSE)) 2572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru - *textoffset; 2573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end = ucol_getOffset(coleiter); 2577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end, 2578b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status) || 2579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_FAILURE(*status)) { 2580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru end = getNextUStringSearchBaseOffset(strsrch, end); 2584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this totally matches, however we need to check if it is repeating 2585b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (checkRepeatedMatch(strsrch, *textoffset, end) || 2586b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru !isBreakUnit(strsrch, *textoffset, end) || 2587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru !checkIdentical(strsrch, *textoffset, end)) { 2588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*textoffset) --; 2589b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *textoffset = getPreviousBaseOffset(strsrch->search->text, 2590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *textoffset); 2591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2593b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = *textoffset; 2595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = end - *textoffset; 2596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2598c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif // #if BOYER_MOORE 2599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// constructors and destructor ------------------------------------------- 2601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2602b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 2603b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t patternlength, 2604b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *text, 2605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength, 2606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *locale, 2607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiter, 2608b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 2609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if UCONFIG_NO_BREAK_ITERATION 2614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiter != NULL) { 2615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_UNSUPPORTED_ERROR; 2616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (locale) { 2620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ucol_open internally checks for status 2621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollator *collator = ucol_open(locale, status); 2622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pattern, text checks are done in usearch_openFromCollator 2623b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UStringSearch *result = usearch_openFromCollator(pattern, 2624b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru patternlength, text, textlength, 2625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru collator, breakiter, status); 2626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result == NULL || U_FAILURE(*status)) { 2628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (collator) { 2629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_close(collator); 2630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->ownCollator = TRUE; 2635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( 2643b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *pattern, 2644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternlength, 2645b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *text, 2646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength, 2647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator, 2648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiter, 2649b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 2650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if UCONFIG_NO_BREAK_ITERATION 2655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiter != NULL) { 2656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_UNSUPPORTED_ERROR; 2657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pattern == NULL || text == NULL || collator == NULL) { 2661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // string search does not really work when numeric collation is turned on 2666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { 2667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_UNSUPPORTED_ERROR; 2668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 2672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru initializeFCD(status); 2673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UStringSearch *result; 2678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textlength == -1) { 2679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength = u_strlen(text); 2680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (patternlength == -1) { 2682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternlength = u_strlen(pattern); 2683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textlength <= 0 || patternlength <= 0) { 2685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2688b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); 2690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result == NULL) { 2691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_MEMORY_ALLOCATION_ERROR; 2692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->collator = collator; 2696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->strength = ucol_getStrength(collator); 2697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->ceMask = getMask(result->strength); 2698b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru result->toShift = 2699b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 2700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCOL_SHIFTED; 2701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->variableTop = ucol_getVariableTop(collator, status); 2702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 270350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho result->nfd = Normalizer2Factory::getNFDInstance(*status); 270450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(result); 2707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search = (USearch *)uprv_malloc(sizeof(USearch)); 2711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result->search == NULL) { 2712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_MEMORY_ALLOCATION_ERROR; 2713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(result); 2714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->text = text; 2718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->textLength = textlength; 2719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->pattern.text = pattern; 2721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->pattern.textLength = patternlength; 2722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->pattern.CE = NULL; 2723c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru result->pattern.PCE = NULL; 2724b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->breakIter = breakiter; 2726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 2727b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status); 2728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiter) { 2729b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ubrk_setText(breakiter, text, textlength, status); 2730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->ownCollator = FALSE; 2734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->matchedLength = 0; 2735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->matchedIndex = USEARCH_DONE; 2736c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru result->utilIter = NULL; 2737b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru result->textIter = ucol_openElements(collator, text, 2738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength, status); 2739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_close(result); 2741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->isOverlap = FALSE; 2745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->isCanonicalMatch = FALSE; 274650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho result->search->elementComparisonType = 0; 2747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->isForwardSearching = TRUE; 2748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->reset = TRUE; 2749b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru initialize(result, status); 2751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_close(result); 2754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) 2763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.CE != strsrch->pattern.CEBuffer && 2766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.CE) { 2767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(strsrch->pattern.CE); 2768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2769c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2770c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (strsrch->pattern.PCE != NULL && 2771c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { 2772c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_free(strsrch->pattern.PCE); 2773c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2774c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_closeElements(strsrch->textIter); 2776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_closeElements(strsrch->utilIter); 2777c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->ownCollator && strsrch->collator) { 2779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_close((UCollator *)strsrch->collator); 2780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2781c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2782c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 2783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->internalBreakIter) { 2784b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ubrk_close(strsrch->search->internalBreakIter); 2785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2786c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 2787c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(strsrch->search); 2789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(strsrch); 2790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// set and get methods -------------------------------------------------- 2794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2795b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 2796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t position, 2797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 2800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isOutOfBounds(strsrch->search->textLength, position)) { 2801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_INDEX_OUTOFBOUNDS_ERROR; 2802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, position); 2805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = USEARCH_DONE; 2807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = 0; 2808b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->reset = FALSE; 2809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) 2813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = ucol_getOffset(strsrch->textIter); 2816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isOutOfBounds(strsrch->search->textLength, result)) { 2817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2823b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2824b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 2825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearchAttribute attribute, 2826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearchAttributeValue value, 2827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 2830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (attribute) 2831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_OVERLAP : 2833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); 2834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_CANONICAL_MATCH : 2836b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE : 2837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru FALSE); 2838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 283950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case USEARCH_ELEMENT_COMPARISON : 284050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 284150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->search->elementComparisonType = (int16_t)value; 284250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 284350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->search->elementComparisonType = 0; 284450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 284550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 2846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_ATTRIBUTE_COUNT : 2847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 2848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { 2852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2855b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( 2857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UStringSearch *strsrch, 2858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearchAttribute attribute) 2859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (attribute) { 2862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_OVERLAP : 2863b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return (strsrch->search->isOverlap == TRUE ? USEARCH_ON : 2864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USEARCH_OFF); 2865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_CANONICAL_MATCH : 2866b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON : 2867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USEARCH_OFF); 286850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case USEARCH_ELEMENT_COMPARISON : 286950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { 287050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int16_t value = strsrch->search->elementComparisonType; 287150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 287250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (USearchAttributeValue)value; 287350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 287450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return USEARCH_STANDARD_ELEMENT_COMPARISON; 287550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 287650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 2877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_ATTRIBUTE_COUNT : 2878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DEFAULT; 2879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DEFAULT; 2882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( 2885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UStringSearch *strsrch) 2886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch == NULL) { 2888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->search->matchedIndex; 2891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2894b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 2895b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UChar *result, 2896b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t resultCapacity, 2897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2902b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 && 2903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result == NULL)) { 2904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t copylength = strsrch->search->matchedLength; 2909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t copyindex = strsrch->search->matchedIndex; 2910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (copyindex == USEARCH_DONE) { 2911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_terminateUChars(result, resultCapacity, 0, status); 2912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (resultCapacity < copylength) { 2916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru copylength = resultCapacity; 2917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (copylength > 0) { 2919b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uprv_memcpy(result, strsrch->search->text + copyindex, 2920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru copylength * sizeof(UChar)); 2921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2922b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return u_terminateUChars(result, resultCapacity, 2923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength, status); 2924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2925b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( 2927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UStringSearch *strsrch) 2928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->search->matchedLength; 2931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 2936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2937b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 2938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiter, 2939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 2942b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho strsrch->search->breakIter = breakiter; 2943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiter) { 2944b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ubrk_setText(breakiter, strsrch->search->text, 2945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength, status); 2946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2950b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI const UBreakIterator* U_EXPORT2 2951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruusearch_getBreakIterator(const UStringSearch *strsrch) 2952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->search->breakIter; 2955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2958b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2960b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2961b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 2962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text, 2963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength, 2964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 2967b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strsrch == NULL || text == NULL || textlength < -1 || 2968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength == 0) { 2969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textlength == -1) { 2973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength = u_strlen(text); 2974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->text = text; 2976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength = textlength; 2977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(strsrch->textIter, text, textlength, status); 2978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = USEARCH_DONE; 2979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = 0; 2980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->reset = TRUE; 2981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 2982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->breakIter != NULL) { 2983b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ubrk_setText(strsrch->search->breakIter, text, 2984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength, status); 2985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status); 2987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2992b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 2993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *length) 2994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *length = strsrch->search->textLength; 2997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->search->text; 2998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 3000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3002b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 3003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator, 3004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (collator == NULL) { 3008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 3009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3011c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 3013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->ownCollator && (strsrch->collator != collator)) { 3014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_close((UCollator *)strsrch->collator); 3015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->ownCollator = FALSE; 3016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->collator = collator; 3018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->strength = ucol_getStrength(collator); 3019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->ceMask = getMask(strsrch->strength); 3020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 3021b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ubrk_close(strsrch->search->internalBreakIter); 3022b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status), 3023b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho strsrch->search->text, strsrch->search->textLength, status); 3024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 3025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3026b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->toShift = 3027b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 3028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCOL_SHIFTED; 3029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_getVariableTop returns 0 3030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->variableTop = ucol_getVariableTop(collator, status); 3031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru initialize(strsrch, status); 3033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3034c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru /* free offset buffer to avoid memory leak before initializing. */ 3035b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); 3036b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uprv_init_collIterate(collator, strsrch->search->text, 3037b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->textLength, 303850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho &(strsrch->textIter->iteratordata_), 303950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho status); 3040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->utilIter->iteratordata_.coll = collator; 3041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3044c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3045c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // **** are these calls needed? 3046c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // **** we call uprv_init_pce in initializePatternPCETable 3047c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // **** and the CEBuffer constructor... 3048c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if 0 3049c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_init_pce(strsrch->textIter); 3050c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_init_pce(strsrch->utilIter); 3051c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) 3056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 3058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return (UCollator *)strsrch->collator; 3059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 3061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3063b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 3064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *pattern, 3065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternlength, 3066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch == NULL || pattern == NULL) { 3070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 3071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (patternlength == -1) { 3074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternlength = u_strlen(pattern); 3075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (patternlength == 0) { 3077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 3078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.text = pattern; 3081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.textLength = patternlength; 3082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru initialize(strsrch, status); 3083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3084b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3087b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI const UChar* U_EXPORT2 3088b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruusearch_getPattern(const UStringSearch *strsrch, 3089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *length) 3090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 3092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *length = strsrch->pattern.textLength; 3093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->pattern.text; 3094b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3095b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 3096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// miscellanous methods -------------------------------------------------- 3099b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3100b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 3101b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 3102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch && U_SUCCESS(*status)) { 3104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = TRUE; 3105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_setOffset(strsrch, 0, status); 3106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return usearch_next(strsrch, status); 3108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3113b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 3114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t position, 3115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch && U_SUCCESS(*status)) { 3118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = TRUE; 3119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // position checked in usearch_setOffset 3120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_setOffset(strsrch, position, status); 3121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3122b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return usearch_next(strsrch, status); 3123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3127b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3128b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 3129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch && U_SUCCESS(*status)) { 3132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = FALSE; 3133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_setOffset(strsrch, strsrch->search->textLength, status); 3134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return usearch_previous(strsrch, status); 3136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3141b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 3142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t position, 3143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch && U_SUCCESS(*status)) { 3146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = FALSE; 3147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // position checked in usearch_setOffset 3148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_setOffset(strsrch, position, status); 3149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3150b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return usearch_previous(strsrch, status); 3151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3155b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 3157b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* If a direction switch is required, we'll count the number of ces till the 3158b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* beginning of the collation element iterator and iterate forwards that 3159b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* number of times. This is so that we get to the correct point within the 3160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* string to continue the search in. Imagine when we are in the middle of the 3161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* normalization buffer when the change in direction is request. arrrgghh.... 3162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* After searching the offset within the collation element iterator will be 3163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shifted to the start of the match. If a match is not found, the offset would 3164b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* have been set to the end of the text string in the collation element 3165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* iterator. 3166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Okay, here's my take on normalization buffer. The only time when there can 3167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* be 2 matches within the same normalization is when the pattern is consists 3168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* of all accents. But since the offset returned is from the text string, we 3169b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* should not confuse the caller by returning the second match within the 3170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* same normalization buffer. If we do, the 2 results will have the same match 3171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offsets, and that'll be confusing. I'll return the next match that doesn't 3172b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* fall within the same normalization buffer. Note this does not affect the 3173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* results of matches spanning the text and the normalization buffer. 3174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* The position to start searching is taken from the collation element 3175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* iterator. Callers of this API would have to set the offset in the collation 3176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* element iterator before using this method. 3177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 3178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 3179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3180b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{ 3181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 3182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // note offset is either equivalent to the start of the previous match 3183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // or is set by the user 3184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = usearch_getOffset(strsrch); 3185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearch *search = strsrch->search; 3186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->reset = FALSE; 3187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = search->textLength; 3188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->isForwardSearching) { 3189c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 3190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset == textlength 3191b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru || (!search->isOverlap && 3192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (offset + strsrch->pattern.defaultShiftSize > textlength || 3193b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (search->matchedIndex != USEARCH_DONE && 3194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset + search->matchedLength >= textlength)))) { 3195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // not enough characters to match 3196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 3197b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return USEARCH_DONE; 3198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3199c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 3200c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (offset == textlength || 3201c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (! search->isOverlap && 3202c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (search->matchedIndex != USEARCH_DONE && 3203c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru offset + search->matchedLength > textlength))) { 3204c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // not enough characters to match 3205c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 3206c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return USEARCH_DONE; 3207c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3208c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3211b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // switching direction. 3212b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // if matchedIndex == USEARCH_DONE, it means that either a 3213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // setOffset has been called or that previous ran off the text 3214b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // string. the iterator would have been set to offset 0 if a 3215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // match is not found. 3216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->isForwardSearching = TRUE; 3217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedIndex != USEARCH_DONE) { 3218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // there's no need to set the collation element iterator 3219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the next call to next will set the offset. 3220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return search->matchedIndex; 3221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.CELength == 0) { 3226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedIndex == USEARCH_DONE) { 3227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedIndex = offset; 3228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { // moves by codepoints 3230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_FWD_1(search->text, search->matchedIndex, textlength); 3231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3232b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedLength = 0; 3234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, search->matchedIndex); 3235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedIndex == textlength) { 3237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedIndex = USEARCH_DONE; 3238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedLength > 0) { 3242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if matchlength is 0 we are at the start of the iteration 3243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->isOverlap) { 3244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, offset + 1, status); 3245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3247b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucol_setOffset(strsrch->textIter, 3248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset + search->matchedLength, status); 3249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for boundary check purposes. this will ensure that the 3253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // next match will not preceed the current offset 3254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // note search->matchedIndex will always be set to something 3255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // in the code 3256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedIndex = offset - 1; 3257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->isCanonicalMatch) { 3260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // can't use exact here since extra accents are allowed. 3261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_handleNextCanonical(strsrch, status); 3262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_handleNextExact(strsrch, status); 3265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 3269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3272c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if !BOYER_MOORE 3273c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (search->matchedIndex == USEARCH_DONE) { 3274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, search->textLength, status); 3275c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 3276c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, search->matchedIndex, status); 3277c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3278c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3279c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return search->matchedIndex; 3281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 3287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 3290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset; 3291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearch *search = strsrch->search; 3292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->reset) { 3293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset = search->textLength; 3294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->isForwardSearching = FALSE; 3295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->reset = FALSE; 3296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, offset); 3297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset = usearch_getOffset(strsrch); 3300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3301b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t matchedindex = search->matchedIndex; 3303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->isForwardSearching == TRUE) { 3304b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // switching direction. 3305b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // if matchedIndex == USEARCH_DONE, it means that either a 3306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // setOffset has been called or that next ran off the text 3307b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // string. the iterator would have been set to offset textLength if 3308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // a match is not found. 3309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->isForwardSearching = FALSE; 3310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (matchedindex != USEARCH_DONE) { 3311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return matchedindex; 3312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3315c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 3316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset == 0 || matchedindex == 0 || 3317b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (!search->isOverlap && 3318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (offset < strsrch->pattern.defaultShiftSize || 3319b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (matchedindex != USEARCH_DONE && 3320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru matchedindex < strsrch->pattern.defaultShiftSize)))) { 3321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // not enough characters to match 3322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 3323b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return USEARCH_DONE; 3324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3325c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 3326c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Could check pattern length, but the 3327c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // linear search will do the right thing 3328c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (offset == 0 || matchedindex == 0) { 3329c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 3330c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return USEARCH_DONE; 3331c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3332c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.CELength == 0) { 3337b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru search->matchedIndex = 3338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (matchedindex == USEARCH_DONE ? offset : matchedindex); 3339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedIndex == 0) { 3340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 3341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { // move by codepoints 3344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UTF_BACK_1(search->text, 0, search->matchedIndex); 3345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, search->matchedIndex); 3346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedLength = 0; 3348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isCanonicalMatch) { 3352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // can't use exact here since extra accents are allowed. 3353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_handlePreviousCanonical(strsrch, status); 3354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_handlePreviousExact(strsrch, status); 3358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 3363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3365b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return search->matchedIndex; 3367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3373b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) 3375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3376b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru /* 3377b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru reset is setting the attributes that are already in 3378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru string search, hence all attributes in the collator should 3379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru be retrieved without any problems 3380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 3381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 3382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool sameCollAttribute = TRUE; 3384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t ceMask; 3385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool shift; 3386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t varTop; 3387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3388b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // **** hack to deal w/ how processed CEs encode quaternary **** 3389b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UCollationStrength newStrength = ucol_getStrength(strsrch->collator); 3390b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || 3391b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { 3392b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru sameCollAttribute = FALSE; 3393b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 3394b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->strength = ucol_getStrength(strsrch->collator); 3396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ceMask = getMask(strsrch->strength); 3397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->ceMask != ceMask) { 3398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->ceMask = ceMask; 3399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sameCollAttribute = FALSE; 3400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3401b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3403b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, 3404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &status) == UCOL_SHIFTED; 3405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->toShift != shift) { 3406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->toShift = shift; 3407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sameCollAttribute = FALSE; 3408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_getVariableTop returns 0 3411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru varTop = ucol_getVariableTop(strsrch->collator, &status); 3412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->variableTop != varTop) { 3413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->variableTop = varTop; 3414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sameCollAttribute = FALSE; 3415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!sameCollAttribute) { 3417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru initialize(strsrch, &status); 3418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3419c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru /* free offset buffer to avoid memory leak before initializing. */ 3420b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); 3421b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uprv_init_collIterate(strsrch->collator, strsrch->search->text, 3422b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->textLength, 342350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho &(strsrch->textIter->iteratordata_), 342450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho &status); 3425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = 0; 3426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = USEARCH_DONE; 3427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isOverlap = FALSE; 3428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isCanonicalMatch = FALSE; 342950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->search->elementComparisonType = 0; 3430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = TRUE; 3431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->reset = TRUE; 3432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3435c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3436c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// CEI Collation Element + source text index. 3437c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// These structs are kept in the circular buffer. 3438c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3439c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustruct CEI { 3440c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t ce; 3441c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t lowIndex; 3442c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t highIndex; 3443c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}; 3444c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3445c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_BEGIN 3446c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3447c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3448c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3449c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// CEBuffer A circular buffer of CEs from the text being searched. 3450c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3451b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define DEFAULT_CEBUFFER_SIZE 96 3452b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define CEBUFFER_EXTRA 32 3453b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Some typical max values to make buffer size more reasonable for asymmetric search. 3454b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// #8694 is for a better long-term solution to allocation of this buffer. 3455b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8 3456b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3 3457b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186)) 3458c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustruct CEBuffer { 3459c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru CEI defBuf[DEFAULT_CEBUFFER_SIZE]; 3460c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru CEI *buf; 3461c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t bufSize; 3462c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t firstIx; 3463c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t limitIx; 3464c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UCollationElements *ceIter; 3465c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UStringSearch *strSearch; 3466c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3467c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3468c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3469c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru CEBuffer(UStringSearch *ss, UErrorCode *status); 3470c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ~CEBuffer(); 3471c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru const CEI *get(int32_t index); 3472c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru const CEI *getPrevious(int32_t index); 3473c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}; 3474c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3475c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3476c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruCEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) { 3477c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru buf = defBuf; 3478c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strSearch = ss; 3479b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA; 3480b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (ss->search->elementComparisonType != 0) { 3481b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const UChar * patText = ss->pattern.text; 3482b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (patText) { 3483b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const UChar * patTextLimit = patText + ss->pattern.textLength; 3484b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while ( patText < patTextLimit ) { 3485b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar c = *patText++; 3486b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (MIGHT_BE_JAMO_L(c)) { 3487b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 3488b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 3489b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // No check for surrogates, we might allocate slightly more buffer than necessary. 3490b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 3491b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3492b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3493b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3494b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3495c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ceIter = ss->textIter; 3496c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru firstIx = 0; 3497c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru limitIx = 0; 3498c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3499c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_init_pce(ceIter); 3500c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3501c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (bufSize>DEFAULT_CEBUFFER_SIZE) { 3502c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); 3503c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (buf == NULL) { 3504c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *status = U_MEMORY_ALLOCATION_ERROR; 3505c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3506c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3507c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3508c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3509c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// TODO: add a reset or init function so that allocated 3510c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// buffers can be retained & reused. 3511c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3512c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruCEBuffer::~CEBuffer() { 3513c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (buf != defBuf) { 3514c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_free(buf); 3515c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3516c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3517c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3518c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3519c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Get the CE with the specified index. 3520c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Index must be in the range 3521c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// n-history_size < index < n+1 3522c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// where n is the largest index to have been fetched by some previous call to this function. 3523c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3524c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3525c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruconst CEI *CEBuffer::get(int32_t index) { 3526c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int i = index % bufSize; 3527c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3528c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index>=firstIx && index<limitIx) { 3529c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The request was for an entry already in our buffer. 3530c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Just return it. 3531c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return &buf[i]; 3532c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3533c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3534c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Caller is requesting a new, never accessed before, CE. 3535c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Verify that it is the next one in sequence, which is all 3536c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // that is allowed. 3537c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index != limitIx) { 3538c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(FALSE); 3539c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3540c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 3541c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3542c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3543c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Manage the circular CE buffer indexing 3544c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru limitIx++; 3545c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3546c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (limitIx - firstIx >= bufSize) { 3547c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The buffer is full, knock out the lowest-indexed entry. 3548c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru firstIx++; 3549c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3550c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3551c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3552c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3553c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status); 3554c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3555c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return &buf[i]; 3556c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3557c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3558c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Get the CE with the specified index. 3559c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Index must be in the range 3560c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// n-history_size < index < n+1 3561c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// where n is the largest index to have been fetched by some previous call to this function. 3562c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3563c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3564c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruconst CEI *CEBuffer::getPrevious(int32_t index) { 3565c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int i = index % bufSize; 3566c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3567c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index>=firstIx && index<limitIx) { 3568c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The request was for an entry already in our buffer. 3569c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Just return it. 3570c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return &buf[i]; 3571c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3572c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3573c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Caller is requesting a new, never accessed before, CE. 3574c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Verify that it is the next one in sequence, which is all 3575c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // that is allowed. 3576c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index != limitIx) { 3577c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(FALSE); 3578c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3579c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 3580c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3581c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3582c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Manage the circular CE buffer indexing 3583c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru limitIx++; 3584c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3585c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (limitIx - firstIx >= bufSize) { 3586c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The buffer is full, knock out the lowest-indexed entry. 3587c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru firstIx++; 3588c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3589c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3590c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3591c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3592c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status); 3593c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3594c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return &buf[i]; 3595c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3596c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3597c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_END 3598c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3599c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3600c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// #define USEARCH_DEBUG 3601c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3602c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#ifdef USEARCH_DEBUG 3603c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include <stdio.h> 3604c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include <stdlib.h> 3605c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3606c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3607c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/* 3608c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Find the next break boundary after startIndex. If the UStringSearch object 3609c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * has an external break iterator, use that. Otherwise use the internal character 3610c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * break iterator. 3611c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 3612c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) { 3613c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if 0 3614c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru const UChar *text = strsrch->search->text; 3615c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t textLen = strsrch->search->textLength; 3616b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3617c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(startIndex>=0); 3618c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(startIndex<=textLen); 3619b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3620c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (startIndex >= textLen) { 3621c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return startIndex; 3622c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3623c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3624c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c; 3625c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t i = startIndex; 3626c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U16_NEXT(text, i, textLen, c); 3627b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3628c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If we are on a control character, stop without looking for combining marks. 3629c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Control characters do not combine. 3630c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3631c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 3632c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return i; 3633c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3634b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3635c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The initial character was not a control, and can thus accept trailing 3636c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // combining characters. Advance over however many of them there are. 3637c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t indexOfLastCharChecked; 3638c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (;;) { 3639c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru indexOfLastCharChecked = i; 3640c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (i>=textLen) { 3641c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 3642c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3643c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U16_NEXT(text, i, textLen, c); 3644c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3645c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3646c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 3647c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3648c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3649c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return indexOfLastCharChecked; 3650c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#elif !UCONFIG_NO_BREAK_ITERATION 3651c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->breakIter; 3652c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3653c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakiterator == NULL) { 3654c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru breakiterator = strsrch->search->internalBreakIter; 3655c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3656c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3657c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakiterator != NULL) { 3658b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return ubrk_following(breakiterator, startIndex); 3659c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3660c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3661c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return startIndex; 3662c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 3663c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // **** or should we use the original code? **** 3664c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return startIndex; 3665c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3666c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3667c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3668c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3669c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/* 3670c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Returns TRUE if index is on a break boundary. If the UStringSearch 3671c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * has an external break iterator, test using that, otherwise test 3672c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * using the internal character break iterator. 3673c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 3674c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) { 3675c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if 0 3676c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru const UChar *text = strsrch->search->text; 3677c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t textLen = strsrch->search->textLength; 3678b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3679c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(index>=0); 3680c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(index<=textLen); 3681b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3682c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index>=textLen || index<=0) { 3683b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return TRUE; 3684c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3685b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3686c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If the character at the current index is not a GRAPHEME_EXTEND 3687c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // then we can not be within a combining sequence. 3688c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c; 3689c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U16_GET(text, 0, index, textLen, c); 3690c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3691c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3692b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return TRUE; 3693c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3694b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3695c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We are at a combining mark. If the preceding character is anything 3696c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // except a CONTROL, CR or LF, we are in a combining sequence. 3697b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru U16_PREV(text, 0, index, c); 3698c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3699b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 3700b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return !combining; 3701c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#elif !UCONFIG_NO_BREAK_ITERATION 3702c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->breakIter; 3703c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3704c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakiterator == NULL) { 3705c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru breakiterator = strsrch->search->internalBreakIter; 3706c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3707c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3708b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index)); 3709c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 3710c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // **** or use the original code? **** 3711b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return TRUE; 3712c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3713b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 3714c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3715c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if 0 3716c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end) 3717c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 3718c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 3719c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->breakIter; 3720c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3721c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakiterator != NULL) { 3722c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t startindex = ubrk_first(breakiterator); 3723c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t endindex = ubrk_last(breakiterator); 3724b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3725c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // out-of-range indexes are never boundary positions 3726c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (start < startindex || start > endindex || 3727c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru end < startindex || end > endindex) { 3728c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 3729c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3730c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3731b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return ubrk_isBoundary(breakiterator, start) && 3732c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ubrk_isBoundary(breakiterator, end); 3733c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3734c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3735c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3736c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 3737c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3738c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3739c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 374050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehotypedef enum { 374150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_MATCH = -1, 374250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_NO_MATCH = 0, 374350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_SKIP_TARG, 374450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_SKIP_PATN 374550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} UCompareCEsResult; 374650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#define U_CE_LEVEL2_BASE 0x00000005 374750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#define U_CE_LEVEL3_BASE 0x00050000 374850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 374950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { 375050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (targCE == patCE) { 375150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_MATCH; 375250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 375350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (compareType == 0) { 375450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_NO_MATCH; 375550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 375650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 375750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t targCEshifted = targCE >> 32; 375850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t patCEshifted = patCE >> 32; 375950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t mask; 376050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 376150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho mask = 0xFFFF0000; 376250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targLev1 = (int32_t)(targCEshifted & mask); 376350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t patLev1 = (int32_t)(patCEshifted & mask); 376450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev1 != patLev1 ) { 376550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev1 == 0 ) { 376650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_SKIP_TARG; 376750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 376850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 376950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_SKIP_PATN; 377050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 377150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_NO_MATCH; 377250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 377350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 377450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho mask = 0x0000FFFF; 377550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targLev2 = (int32_t)(targCEshifted & mask); 377650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t patLev2 = (int32_t)(patCEshifted & mask); 377750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev2 != patLev2 ) { 377850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev2 == 0 ) { 377950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_SKIP_TARG; 378050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 378150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 378250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_SKIP_PATN; 378350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 378450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )? 378550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_MATCH: U_CE_NO_MATCH; 378650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 378750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 378850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho mask = 0xFFFF0000; 378950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targLev3 = (int32_t)(targCE & mask); 379050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t patLev3 = (int32_t)(patCE & mask); 379150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev3 != patLev3 ) { 379250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )? 379350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_MATCH: U_CE_NO_MATCH; 379450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 379550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 379650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_MATCH; 379750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 379850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 379950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if BOYER_MOORE 380050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s 380150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 3802b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3803c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 3804c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t startIdx, 3805c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t *matchStart, 3806c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t *matchLimit, 3807b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 3808c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 3809c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*status)) { 3810c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 3811c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3812c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3813c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: reject search patterns beginning with a combining char. 3814c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3815c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#ifdef USEARCH_DEBUG 3816c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 3817c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("Pattern CEs\n"); 3818c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (int ii=0; ii<strsrch->pattern.CELength; ii++) { 3819c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf(" %8x", strsrch->pattern.CE[ii]); 3820c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3821c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("\n"); 3822c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3823b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3824c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3825c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Input parameter sanity check. 3826c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: should input indicies clip to the text length 3827c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // in the same way that UText does. 3828b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if(strsrch->pattern.CELength == 0 || 3829c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru startIdx < 0 || 3830c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru startIdx > strsrch->search->textLength || 3831c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->pattern.CE == NULL) { 3832c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 3833c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 3834c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3835c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3836c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (strsrch->pattern.PCE == NULL) { 3837c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru initializePatternPCETable(strsrch, status); 3838c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3839c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3840c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, startIdx, status); 3841c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru CEBuffer ceb(strsrch, status); 3842c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3843b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3844b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t targetIx = 0; 384550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const CEI *targetCEI = NULL; 3846c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t patIx; 3847c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool found; 3848c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3849c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t mStart = -1; 3850c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t mLimit = -1; 3851c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t minLimit; 3852c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t maxLimit; 3853b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3854b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3855b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3856c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Outer loop moves over match starting positions in the 3857c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // target CE space. 385850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Here we see the target as a sequence of collation elements, resulting from the following: 385950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 386050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // (for example, digraphs such as IJ may be broken into two characters). 386150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 386250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 386350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 386450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 386550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // then the CE is deleted, so the following code sees only CEs that are relevant. 386650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 386750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 386850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 386950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 3870c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for(targetIx=0; ; targetIx++) 3871c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 3872c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = TRUE; 3873c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Inner loop checks for a match beginning at each 3874c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // position from the outer loop. 387550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targetIxOffset = 0; 387650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t patCE = 0; 3877b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 3878b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (compared to the last CE fetched for the previous targetIx value) as we need to go 3879b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 3880b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const CEI *firstCEI = ceb.get(targetIx); 3881b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (firstCEI == NULL) { 3882b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *status = U_INTERNAL_PROGRAM_ERROR; 3883b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 3884b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 3885b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3886b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 3887b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) { 388850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho patCE = strsrch->pattern.PCE[patIx]; 388950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetCEI = ceb.get(targetIx+patIx+targetIxOffset); 3890c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Compare CE from target string with CE from the pattern. 389150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 3892c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // which will fail the compare, below. 389350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 389450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( ceMatch == U_CE_NO_MATCH ) { 3895c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 3896c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 389750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if ( ceMatch > U_CE_NO_MATCH ) { 389850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( ceMatch == U_CE_SKIP_TARG ) { 389950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // redo with same patCE, next targCE 390050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho patIx--; 390150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetIxOffset++; 390250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { // ceMatch == U_CE_SKIP_PATN 390350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // redo with same targCE, next patCE 390450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetIxOffset--; 390550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 3906c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3907c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 390850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far 3909c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 391050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 3911c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No match at this targetIx. Try again at the next. 3912c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 3913c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3914c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3915c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (!found) { 3916c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No match at all, we have run off the end of the target text. 3917c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 3918c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3919c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3920c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3921c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have found a match in CE space. 3922c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Now determine the bounds in string index space. 3923c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // There still is a chance of match failure if the CE range not correspond to 3924c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // an acceptable character range. 3925c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 392650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1); 3927c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3928c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mStart = firstCEI->lowIndex; 3929c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru minLimit = lastCEI->lowIndex; 3930c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3931c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Look at the CE following the match. If it is UCOL_NULLORDER the match 3932c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // extended to the end of input, and the match is good. 3933c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3934c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Look at the high and low indices of the CE following the match. If 3935c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // they are the same it means one of two things: 3936c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 1. The match extended to the last CE from the target text, which is OK, or 3937c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2. The last CE that was part of the match is in an expansion that extends 3938c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // to the first CE after the match. In this case, we reject the match. 3939b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const CEI *nextCEI = 0; 394050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (strsrch->search->elementComparisonType == 0) { 3941b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho nextCEI = ceb.get(targetIx + targetIxOffset); 394250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho maxLimit = nextCEI->lowIndex; 394350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 394450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho found = FALSE; 394550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 394650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 394750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for ( ; ; ++targetIxOffset ) { 394850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho nextCEI = ceb.get(targetIx + targetIxOffset); 394950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho maxLimit = nextCEI->lowIndex; 3950b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we are at the end of the target too, match succeeds 395150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) { 395250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 395350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 395450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // As long as the next CE has primary weight of 0, 395550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // it is part of the last target element matched by the pattern; 395650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // make sure it can be part of a match with the last patCE 395750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) { 3958b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType); 3959b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) { 3960b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 3961b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 3962b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 396350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 396450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // target element, but it has non-zero primary weight => match fails 396550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if ( nextCEI->lowIndex == nextCEI->highIndex ) { 3966b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = false; 3967b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 396850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Else the target CE is not part of an expansion of the last matched element, match succeeds 396950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 3970b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 397150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 397250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 3973c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3974b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3975c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3976c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Check for the start of the match being within a combining sequence. 3977c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This can happen if the pattern itself begins with a combining char, and 3978c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the match found combining marks in the target text that were attached 3979c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // to something else. 3980c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This type of match should be rejected for not completely consuming a 3981c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // combining sequence. 3982b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (!isBreakBoundary(strsrch, mStart)) { 3983c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 3984c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3985c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3986c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Check for the start of the match being within an Collation Element Expansion, 3987c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // meaning that the first char of the match is only partially matched. 3988b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // With exapnsions, the first CE will report the index of the source 3989c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // character, and all subsequent (expansions) CEs will report the source index of the 3990b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // _following_ character. 3991c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t secondIx = firstCEI->highIndex; 3992c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mStart == secondIx) { 3993c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 3994c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3995b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3996c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Advance the match end position to the first acceptable match boundary. 3997c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This advances the index over any combining charcters. 3998c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mLimit = maxLimit; 3999c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (minLimit < maxLimit) { 4000b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // When the last CE's low index is same with its high index, the CE is likely 4001b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // a part of expansion. In this case, the index is located just after the 4002b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // character corresponding to the CEs compared above. If the index is right 4003b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // at the break boundary, move the position to the next boundary will result 4004b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // incorrect match length when there are ignorable characters exist between 4005b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the position and the next character produces CE(s). See ticket#8482. 4006b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) { 4007b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = minLimit; 4008b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4009b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4010b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (nba >= lastCEI->highIndex) { 4011b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = nba; 4012b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4013c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4014c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4015b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4016c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #ifdef USEARCH_DEBUG 4017c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4018c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4019c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4020c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #endif 4021b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4022c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If advancing to the end of a combining sequence in character indexing space 4023b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // advanced us beyond the end of the match in CE space, reject this match. 4024c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mLimit > maxLimit) { 4025c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4026c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4027c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4028b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (!isBreakBoundary(strsrch, mLimit)) { 4029c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4030c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4031c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4032b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (! checkIdentical(strsrch, mStart, mLimit)) { 4033b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = FALSE; 4034b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 4035b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4036c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (found) { 4037c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4038c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4039c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4040c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4041c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #ifdef USEARCH_DEBUG 4042c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4043c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4044c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t lastToPrint = ceb.limitIx+2; 4045c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4046c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4047c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4048c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("\n%s\n", found? "match found" : "no match"); 4049c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4050c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #endif 4051c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4052c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // All Done. Store back the match bounds to the caller. 4053c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4054c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (found==FALSE) { 4055c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mLimit = -1; 4056c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mStart = -1; 4057c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4058c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4059c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (matchStart != NULL) { 4060c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *matchStart= mStart; 4061c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4062c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4063c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (matchLimit != NULL) { 4064c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *matchLimit = mLimit; 4065c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4066c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4067c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return found; 4068c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4069c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4070c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 4071c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t startIdx, 4072c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t *matchStart, 4073c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t *matchLimit, 4074b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 4075c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 4076c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*status)) { 4077c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4078c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4079c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4080c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: reject search patterns beginning with a combining char. 4081c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4082c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#ifdef USEARCH_DEBUG 4083c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4084c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("Pattern CEs\n"); 4085c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (int ii=0; ii<strsrch->pattern.CELength; ii++) { 4086c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf(" %8x", strsrch->pattern.CE[ii]); 4087c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4088c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("\n"); 4089c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4090b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4091c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4092c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Input parameter sanity check. 4093c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: should input indicies clip to the text length 4094c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // in the same way that UText does. 4095b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if(strsrch->pattern.CELength == 0 || 4096c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru startIdx < 0 || 4097c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru startIdx > strsrch->search->textLength || 4098c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->pattern.CE == NULL) { 4099c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 4100c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4101c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4102c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4103c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (strsrch->pattern.PCE == NULL) { 4104c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru initializePatternPCETable(strsrch, status); 4105c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4106c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4107c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru CEBuffer ceb(strsrch, status); 4108b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t targetIx = 0; 4109c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4110c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru /* 4111c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Pre-load the buffer with the CE's for the grapheme 4112c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * after our starting position so that we're sure that 4113c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * we can look at the CE following the match when we 4114c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * check the match boundaries. 4115c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * 4116c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * This will also pre-fetch the first CE that we'll 4117c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * consider for the match. 4118c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 4119c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (startIdx < strsrch->search->textLength) { 4120c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBreakIterator *bi = strsrch->search->internalBreakIter; 4121c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t next = ubrk_following(bi, startIdx); 4122c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4123c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, next, status); 4124c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4125c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (targetIx = 0; ; targetIx += 1) { 4126c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { 4127c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4128c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4129c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4130c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4131c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, startIdx, status); 4132c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4133b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 413550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const CEI *targetCEI = NULL; 4136c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t patIx; 4137c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool found; 4138c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4139c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t limitIx = targetIx; 4140c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t mStart = -1; 4141c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t mLimit = -1; 4142c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t minLimit; 4143c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t maxLimit; 4144b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4145b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4146b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4147c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Outer loop moves over match starting positions in the 4148c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // target CE space. 414950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 415050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // But patIx is 0 at the beginning of the pattern and increases toward the end. 415150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 415250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // and the beginning of the base text. 4153c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for(targetIx = limitIx; ; targetIx += 1) 4154c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 4155c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = TRUE; 4156b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 4157b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (compared to the last CE fetched for the previous targetIx value) as we need to go 4158b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 4159b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const CEI *lastCEI = ceb.getPrevious(targetIx); 4160b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (lastCEI == NULL) { 4161b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *status = U_INTERNAL_PROGRAM_ERROR; 4162b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 4163b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 4164b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4165c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Inner loop checks for a match beginning at each 4166c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // position from the outer loop. 416750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targetIxOffset = 0; 4168b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) { 4169c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t patCE = strsrch->pattern.PCE[patIx]; 4170c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 417150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset); 4172c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Compare CE from target string with CE from the pattern. 4173c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 4174c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // which will fail the compare, below. 417550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 417650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( ceMatch == U_CE_NO_MATCH ) { 4177c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4178c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 417950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if ( ceMatch > U_CE_NO_MATCH ) { 418050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( ceMatch == U_CE_SKIP_TARG ) { 418150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // redo with same patCE, next targCE 418250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho patIx++; 418350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetIxOffset++; 418450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { // ceMatch == U_CE_SKIP_PATN 418550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // redo with same targCE, next patCE 418650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetIxOffset--; 418750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 4188c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4189c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4190c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 419150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 4192c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No match at this targetIx. Try again at the next. 4193c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 4194c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4195c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4196c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (!found) { 4197c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No match at all, we have run off the end of the target text. 4198c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4199c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4200c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4201c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4202c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have found a match in CE space. 4203c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Now determine the bounds in string index space. 4204c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // There still is a chance of match failure if the CE range not correspond to 4205c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // an acceptable character range. 4206c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 420750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset); 4208c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mStart = firstCEI->lowIndex; 4209c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4210c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Check for the start of the match being within a combining sequence. 4211c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This can happen if the pattern itself begins with a combining char, and 4212c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the match found combining marks in the target text that were attached 4213c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // to something else. 4214c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This type of match should be rejected for not completely consuming a 4215c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // combining sequence. 4216b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (!isBreakBoundary(strsrch, mStart)) { 4217c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4218c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4219c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4220c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Look at the high index of the first CE in the match. If it's the same as the 4221c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // low index, the first CE in the match is in the middle of an expansion. 4222c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mStart == firstCEI->highIndex) { 4223c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4224c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4225b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4226c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4227b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho minLimit = lastCEI->lowIndex; 4228b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4229b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (targetIx > 0) { 4230b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look at the CE following the match. If it is UCOL_NULLORDER the match 4231b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // extended to the end of input, and the match is good. 4232b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4233b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look at the high and low indices of the CE following the match. If 4234b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // they are the same it means one of two things: 4235b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 1. The match extended to the last CE from the target text, which is OK, or 4236b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 2. The last CE that was part of the match is in an expansion that extends 4237b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // to the first CE after the match. In this case, we reject the match. 4238b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const CEI *nextCEI = ceb.getPrevious(targetIx - 1); 4239b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4240b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 4241b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 4242b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4243b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4244b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = maxLimit = nextCEI->lowIndex; 4245b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4246b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Advance the match end position to the first acceptable match boundary. 4247b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // This advances the index over any combining charcters. 4248b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (minLimit < maxLimit) { 4249b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4250b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4251b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (nba >= lastCEI->highIndex) { 4252b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = nba; 4253b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4254c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4255b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4256b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If advancing to the end of a combining sequence in character indexing space 4257b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // advanced us beyond the end of the match in CE space, reject this match. 4258b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (mLimit > maxLimit) { 4259b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 4260b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4261b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4262b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Make sure the end of the match is on a break boundary 4263b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (!isBreakBoundary(strsrch, mLimit)) { 4264b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 4265b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4266b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4267b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4268b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // No non-ignorable CEs after this point. 4269b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The maximum position is detected by boundary after 4270b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the last non-ignorable CE. Combining sequence 4271b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // across the start index will be truncated. 4272b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4273b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 4274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4275b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4276c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #ifdef USEARCH_DEBUG 4277c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4278c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4279c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4280c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #endif 4281b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4282c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4283b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (! checkIdentical(strsrch, mStart, mLimit)) { 4284b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = FALSE; 4285b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 4286b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4287c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (found) { 4288c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4289c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4290c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4291c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4292c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #ifdef USEARCH_DEBUG 4293c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4294c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4295c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t lastToPrint = ceb.limitIx+2; 4296c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4297c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4298c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4299c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("\n%s\n", found? "match found" : "no match"); 4300c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4301c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #endif 4302c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4303c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // All Done. Store back the match bounds to the caller. 4304c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4305c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (found==FALSE) { 4306c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mLimit = -1; 4307c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mStart = -1; 4308c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4309c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4310c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (matchStart != NULL) { 4311c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *matchStart= mStart; 4312c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4313c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4314c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (matchLimit != NULL) { 4315c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *matchLimit = mLimit; 4316c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4317c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4318c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return found; 4319c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4320c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// internal use methods declared in usrchimp.h ----------------------------- 4322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) 4324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 4325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4330c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 4331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 4332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 4333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *patternce = strsrch->pattern.CE; 4334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patterncelength = strsrch->pattern.CELength; 4335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset = ucol_getOffset(coleiter); 4336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status used in setting coleiter offset, since offset is checked in 4338b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // shiftForward before setting the coleiter offset, status never 4339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // a failure 4340b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patterncelength); 4342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (textoffset <= textlength) 4343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t patternceindex = patterncelength - 1; 4345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t targetce; 4346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool found = FALSE; 4347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastce = UCOL_NULLORDER; 4348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // finding the last pattern ce match, imagine composite characters 4353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for example: search for pattern A in text \u00C0 4354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we'll have to skip \u0300 the grave first before we get to A 4355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_previous(coleiter, status); 4356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4361b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) { 4362b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // this is for the text \u0315\u0300 that requires 4363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalization and pattern \u0300, where \u0315 is ignorable 4364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastce = targetce; 4368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 436950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == patternce[patternceindex]) { 4371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the first ce can be a contraction 4372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = TRUE; 4373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!hasExpansion(coleiter)) { 4376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //targetce = lastce; 4382b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (found && patternceindex > 0) { 4384b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho lastce = targetce; 4385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_previous(coleiter, status); 4386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE) { 4392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex --; 439650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4397b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = found && targetce == patternce[patternceindex]; 4398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4399b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = lastce; 4401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!found) { 4403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4406b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = shiftForward(strsrch, textoffset, lastce, 4407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex); 4408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked at loop. 4409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex = patterncelength; 4410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkNextExactMatch(strsrch, &textoffset, status)) { 4414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in ucol_setOffset 4415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 4417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4421c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 4422c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t textOffset = ucol_getOffset(strsrch->textIter); 4423c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t start = -1; 4424c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t end = -1; 4425c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4426c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4427c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedIndex = start; 4428c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedLength = end - start; 4429c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 4430c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4431c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 4432c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4433c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4434c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) 4438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 4439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4444c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 4445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 4446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 4447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *patternce = strsrch->pattern.CE; 4448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patterncelength = strsrch->pattern.CELength; 4449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset = ucol_getOffset(coleiter); 4450b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool hasPatternAccents = 4451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4452b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4453b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patterncelength); 4455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0] = 0; 4456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents[0] = 0; 4457b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (textoffset <= textlength) 4459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex = patterncelength - 1; 4461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t targetce; 4462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool found = FALSE; 4463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastce = UCOL_NULLORDER; 4464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // finding the last pattern ce match, imagine composite characters 4469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for example: search for pattern A in text \u00C0 4470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we'll have to skip \u0300 the grave first before we get to A 4471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_previous(coleiter, status); 4472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastce = targetce; 4479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 448050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == patternce[patternceindex]) { 4482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the first ce can be a contraction 4483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = TRUE; 4484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!hasExpansion(coleiter)) { 4487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4491b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (found && patternceindex > 0) { 4493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_previous(coleiter, status); 4494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE) { 4500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex --; 450450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4505b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = found && targetce == patternce[patternceindex]; 4506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // initializing the rearranged accent array 4509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (hasPatternAccents && !found) { 4510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0] = 0; 4511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents[0] = 0; 4512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = doNextCanonicalMatch(strsrch, textoffset, status); 4516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!found) { 4519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4522b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = shiftForward(strsrch, textoffset, lastce, 4523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex); 4524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked at loop 4525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex = patterncelength; 4526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4528b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkNextCanonicalMatch(strsrch, &textoffset, status)) { 4530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 4532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4536c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 4537c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t textOffset = ucol_getOffset(strsrch->textIter); 4538c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t start = -1; 4539c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t end = -1; 4540c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4541c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4542c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedIndex = start; 4543c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedLength = end - start; 4544c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 4545c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4546c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 4547c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4548c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4549c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) 4553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 4554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4559c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 4560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 4561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *patternce = strsrch->pattern.CE; 4562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patterncelength = strsrch->pattern.CELength; 4563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset = ucol_getOffset(coleiter); 4564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // shifting it check for setting offset 4566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if setOffset is called previously or there was no previous match, we 4567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // leave the offset as it is. 4568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->matchedIndex != USEARCH_DONE) { 4569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset = strsrch->search->matchedIndex; 4570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4571b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4572b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patterncelength); 4574b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (textoffset >= 0) 4576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex = 1; 4578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t targetce; 4579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool found = FALSE; 4580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t firstce = UCOL_NULLORDER; 4581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setOffset does nothing 4583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4586b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // finding the first pattern ce match, imagine composite 4587b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // characters. for example: search for pattern \u0300 in text 4588b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // \u00C0, we'll have to skip A first before we get to 4589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // \u0300 the grave accent 4590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_next(coleiter, status); 4591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru firstce = targetce; 4598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) { 4600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4601b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 460250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == patternce[0]) { 4604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = TRUE; 4605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!hasExpansion(coleiter)) { 4608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // checking for accents in composite character 4609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //targetce = firstce; 4615b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (found && (patternceindex < patterncelength)) { 4617b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho firstce = targetce; 4618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_next(coleiter, status); 4619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE) { 4625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 462850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4629b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = found && targetce == patternce[patternceindex]; 4630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex ++; 4631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4632b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = firstce; 4634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!found) { 4636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4639b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4640b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = reverseShift(strsrch, textoffset, targetce, 4641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex); 4642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex = 0; 4643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4645b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkPreviousExactMatch(strsrch, &textoffset, status)) { 4647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 4649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4653c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 4654b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t textOffset; 4655b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4656b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (strsrch->search->isOverlap) { 4657b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (strsrch->search->matchedIndex != USEARCH_DONE) { 4658b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4659b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4660b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // move the start position at the end of possible match 4661b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho initializePatternPCETable(strsrch, status); 4662b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { 4663b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status); 4664b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (pce == UCOL_PROCESSED_NULLORDER) { 4665b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // at the end of the text 4666b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 4667b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4668b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4669b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_FAILURE(*status)) { 4670b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho setMatchNotFound(strsrch); 4671b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return FALSE; 4672b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4673b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = ucol_getOffset(strsrch->textIter); 4674b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4675b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4676b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = ucol_getOffset(strsrch->textIter); 4677b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4678b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4679c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t start = -1; 4680c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t end = -1; 4681c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4682c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4683c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedIndex = start; 4684c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedLength = end - start; 4685c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 4686c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4687c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 4688c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4689c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4690c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4693b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 4694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 4695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 4696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4701c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 4702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 4703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *patternce = strsrch->pattern.CE; 4704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patterncelength = strsrch->pattern.CELength; 4705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset = ucol_getOffset(coleiter); 4706b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool hasPatternAccents = 4707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4708b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // shifting it check for setting offset 4710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if setOffset is called previously or there was no previous match, we 4711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // leave the offset as it is. 4712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->matchedIndex != USEARCH_DONE) { 4713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset = strsrch->search->matchedIndex; 4714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4715b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4716b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patterncelength); 4718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0] = 0; 4719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents[0] = 0; 4720b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (textoffset >= 0) 4722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex = 1; 4724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t targetce; 4725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool found = FALSE; 4726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t firstce = UCOL_NULLORDER; 4727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4730b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // finding the first pattern ce match, imagine composite 4731b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // characters. for example: search for pattern \u0300 in text 4732b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // \u00C0, we'll have to skip A first before we get to 4733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // \u0300 the grave accent 4734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_next(coleiter, status); 4735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru firstce = targetce; 4742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4743b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 474450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == patternce[0]) { 4746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the first ce can be a contraction 4747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = TRUE; 4748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!hasExpansion(coleiter)) { 4751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // checking for accents in composite character 4752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = firstce; 4758b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (found && patternceindex < patterncelength) { 4760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_next(coleiter, status); 4761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE) { 4767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 477050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4771b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = found && targetce == patternce[patternceindex]; 4772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex ++; 4773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // initializing the rearranged accent array 4776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (hasPatternAccents && !found) { 4777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0] = 0; 4778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents[0] = 0; 4779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = doPreviousCanonicalMatch(strsrch, textoffset, status); 4783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!found) { 4786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4789b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = reverseShift(strsrch, textoffset, targetce, 4790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex); 4791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex = 0; 4792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) { 4796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 4798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4802c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 4803b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t textOffset; 4804b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4805b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (strsrch->search->isOverlap) { 4806b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (strsrch->search->matchedIndex != USEARCH_DONE) { 4807b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4808b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4809b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // move the start position at the end of possible match 4810b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho initializePatternPCETable(strsrch, status); 4811b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { 4812b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status); 4813b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (pce == UCOL_PROCESSED_NULLORDER) { 4814b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // at the end of the text 4815b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 4816b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4817b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4818b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_FAILURE(*status)) { 4819b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho setMatchNotFound(strsrch); 4820b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return FALSE; 4821b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4822b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = ucol_getOffset(strsrch->textIter); 4823b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4824b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4825b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = ucol_getOffset(strsrch->textIter); 4826b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4827b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4828c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t start = -1; 4829c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t end = -1; 4830c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4831c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4832c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedIndex = start; 4833c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedLength = end - start; 4834c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 4835c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4836c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 4837c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4838c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4839c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_COLLATION */ 4843