1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 3c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert* Copyright (C) 2001-2015 IBM and others. All rights reserved. 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Date Name Description 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 07/02/2001 synwee Creation. 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/usearch.h" 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ustring.h" 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uchar.h" 1783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius#include "unicode/utf16.h" 1850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "normalizer2impl.h" 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "usrchimp.h" 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cmemory.h" 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "ucln_in.h" 22c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uassert.h" 2350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "ustr_imp.h" 24c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 25c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_USE 26c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 27c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// don't use Boyer-Moore 2850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// (and if we decide to turn this on again there are several new TODOs that will need to be addressed) 29c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#define BOYER_MOORE 0 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// internal definition --------------------------------------------------- 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define LAST_BYTE_MASK_ 0xFF 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define SECOND_LAST_BYTE_SHIFT_ 8 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define SUPPLEMENTARY_MIN_VALUE_ 0x10000 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Corneliusstatic const Normalizer2Impl *g_nfcImpl = NULL; 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// internal methods ------------------------------------------------- 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Fast collation element iterator setOffset. 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This function does not check for bounds. 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param offset to set 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 47b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querustatic 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline void setColEIterOffset(UCollationElements *elems, 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset) 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 51fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: Not "fast" any more after the 2013 collation rewrite. 52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We do not want to expose more internals than necessary. 53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode status = U_ZERO_ERROR; 54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ucol_setOffset(elems, offset, &status); 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Getting the mask for collation strength 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strength collation strength 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return collation element mask 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 63b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline uint32_t getMask(UCollationStrength strength) 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 65b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru switch (strength) 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case UCOL_PRIMARY: 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return UCOL_PRIMARYORDERMASK; 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case UCOL_SECONDARY: 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 72b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCOL_PRIMARYORDERMASK; 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 78c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert* @param ce 32-bit collation element 79c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert* @return hash code 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 82c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertinline int hashFromCE32(uint32_t ce) 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 84c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert int hc = (int)( 85c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert ((((((ce >> 24) * 37) + 86c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert (ce >> 16)) * 37) + 87c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert (ce >> 8)) * 37) + 88c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert ce); 89c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert hc %= MAX_TABLE_SIZE_; 90c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (hc < 0) { 91c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert hc += MAX_TABLE_SIZE_; 92c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 93c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return hc; 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CDECL_BEGIN 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic UBool U_CALLCONV 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruusearch_cleanup(void) { 9983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius g_nfcImpl = NULL; 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CDECL_END 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Initializing the fcd tables. 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be a success. 107b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 111b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void initializeFCD(UErrorCode *status) 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 11383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if (g_nfcImpl == NULL) { 11483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the fcd value for a character at the argument index. 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This method takes into accounts of the supplementary characters. 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param str UTF16 string where character for fcd retrieval resides 123b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param offset position of the character whose fcd is to be retrieved, to be 124b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* overwritten with the next character position, taking 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* surrogate characters into consideration. 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strlength length of the argument string 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return fcd value 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 130b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruuint16_t getFCD(const UChar *str, int32_t *offset, 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t strlength) 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 133b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *temp = str + *offset; 13483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); 135b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *offset = (int32_t)(temp - str); 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 140b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Getting the modified collation elements taking into account the collation 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* attributes 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 143b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param sourcece 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return the modified collation element 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // note for tertiary we can't use the collator->tertiaryMask, that 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is a preprocessed mask that takes into account case options. since 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we are only concerned with exact matches, we don't need that. 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sourcece &= strsrch->ceMask; 153b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->toShift) { 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // alternate handling here, since only the 16 most significant digits 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is only used, we can safely do a compare without masking 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if the ce is a variable, we mask and get only the primary values 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // no shifting to quartenary is required since all primary values 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // less than variabletop will need to be masked off anyway. 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->variableTop > sourcece) { 161b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strsrch->strength >= UCOL_QUATERNARY) { 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sourcece &= UCOL_PRIMARYORDERMASK; 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 164b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru else { 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sourcece = UCOL_IGNORABLE; 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 168b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { 169b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru sourcece = 0xFFFF; 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return sourcece; 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 175b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru/** 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Allocate a memory and returns NULL if it failed. 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be a success. 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param size to allocate 179b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return newly allocated array, NULL otherwise 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 184b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void * allocateMemory(uint32_t size, UErrorCode *status) 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t *result = (uint32_t *)uprv_malloc(size); 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result == NULL) { 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_MEMORY_ALLOCATION_ERROR; 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Adds a uint32_t value to a destination array. 195b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Creates a new array if we run out of space. The caller will have to 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* manually deallocate the newly allocated array. 197b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 198b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* before calling this method. destination not to be NULL and has at least 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* size destinationlength. 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param destination target array 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param offset destination offset to add value 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param destinationlength target array size, return value for the new size 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param value to be added 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param increments incremental size expected 205b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return new destination array, destination if there was no new allocation 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 210b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t * addTouint32_tArray(int32_t *destination, 211b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t offset, 212b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t *destinationlength, 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t value, 214b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t increments, 215b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t newlength = *destinationlength; 218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset + 1 == newlength) { 219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru newlength += increments; 220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *temp = (int32_t *)allocateMemory( 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(int32_t) * newlength, status); 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memcpy(temp, destination, sizeof(int32_t) * offset); 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *destinationlength = newlength; 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru destination = temp; 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru destination[offset] = value; 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return destination; 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 234c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Adds a uint64_t value to a destination array. 235b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Creates a new array if we run out of space. The caller will have to 236c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* manually deallocate the newly allocated array. 237b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 238b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* before calling this method. destination not to be NULL and has at least 239c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* size destinationlength. 240c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param destination target array 241c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param offset destination offset to add value 242c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param destinationlength target array size, return value for the new size 243c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param value to be added 244c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param increments incremental size expected 245b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 246c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* method, status assumed to be success when passed in. 247c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @return new destination array, destination if there was no new allocation 248c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru*/ 249c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic 250b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int64_t * addTouint64_tArray(int64_t *destination, 251b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t offset, 252b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t *destinationlength, 253c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint64_t value, 254b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t increments, 255b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 256c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 257c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint32_t newlength = *destinationlength; 258c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (offset + 1 == newlength) { 259c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru newlength += increments; 260c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t *temp = (int64_t *)allocateMemory( 261c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru sizeof(int64_t) * newlength, status); 262b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 263c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*status)) { 264c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 265c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 266c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 267c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_memcpy(temp, destination, sizeof(int64_t) * offset); 268c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *destinationlength = newlength; 269c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru destination = temp; 270c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 271c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 272c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru destination[offset] = value; 273c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return destination; 275c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 276c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 277c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/** 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Initializing the ce table for a pattern. 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Stores non-ignorable collation keys. 280b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Table size will be estimated by the size of the pattern text. Table 281b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* expansion will be perform as we go along. Adding 1 to ensure that the table 282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* size definitely increases. 283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be a success. 284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 285b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 287b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return total number of expansions 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 290b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline uint16_t initializePatternCETable(UStringSearch *strsrch, 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UPattern *pattern = &(strsrch->pattern); 294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t cetablesize = INITIAL_ARRAY_SIZE_; 295f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *cetable = pattern->cesBuffer; 296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t patternlength = pattern->textLength; 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 298b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (coleiter == NULL) { 300b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru coleiter = ucol_openElements(strsrch->collator, pattern->text, 301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternlength, status); 302b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // status will be checked in ucol_next(..) later and if it is an 303b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // returned. 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->utilIter = coleiter; 306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ucol_setText(coleiter, pattern->text, pattern->textLength, status); 30950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 31050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(*status)) { 31150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 313b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 314f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (pattern->ces != cetable && pattern->ces) { 315f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius uprv_free(pattern->ces); 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 317b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint16_t offset = 0; 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint16_t result = 0; 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce; 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SUCCESS(*status)) { 324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t newce = getCE(strsrch, ce); 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (newce) { 326b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru newce, 328b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru patternlength - ucol_getOffset(coleiter) + 1, 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset ++; 334f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (cetable != temp && cetable != pattern->cesBuffer) { 335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(cetable); 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cetable = temp; 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cetable[offset] = 0; 343f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius pattern->ces = cetable; 344f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius pattern->cesLength = offset; 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 350c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Initializing the pce table for a pattern. 351c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Stores non-ignorable collation keys. 352b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Table size will be estimated by the size of the pattern text. Table 353b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* expansion will be perform as we go along. Adding 1 to ensure that the table 354c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* size definitely increases. 355c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Internal method, status assumed to be a success. 356c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* @param strsrch string search data 357b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 358c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* method, status assumed to be success when passed in. 359b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return total number of expansions 360c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru*/ 361c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic 362b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline uint16_t initializePatternPCETable(UStringSearch *strsrch, 363c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode *status) 364c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 365c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UPattern *pattern = &(strsrch->pattern); 366c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; 367f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int64_t *pcetable = pattern->pcesBuffer; 368c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint32_t patternlength = pattern->textLength; 369c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 370b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 371c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (coleiter == NULL) { 372b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru coleiter = ucol_openElements(strsrch->collator, pattern->text, 373c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru patternlength, status); 374b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // status will be checked in ucol_next(..) later and if it is an 375b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 376c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // returned. 377c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->utilIter = coleiter; 378c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ucol_setText(coleiter, pattern->text, pattern->textLength, status); 38050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 38150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(*status)) { 38250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 383c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 384b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 385f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (pattern->pces != pcetable && pattern->pces != NULL) { 386f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius uprv_free(pattern->pces); 387c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 388b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 389c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint16_t offset = 0; 390c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint16_t result = 0; 391c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t pce; 392c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 393fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius icu::UCollationPCE iter(coleiter); 394c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 395c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // ** Should processed CEs be signed or unsigned? 396b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // ** (the rest of the code in this file seems to play fast-and-loose with 397c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && 399c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_SUCCESS(*status)) { 400b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, 401c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru pce, 402b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru patternlength - ucol_getOffset(coleiter) + 1, 403c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru status); 404c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 405c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*status)) { 406c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 407c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 408c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 409c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru offset += 1; 410c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 411f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (pcetable != temp && pcetable != pattern->pcesBuffer) { 412c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_free(pcetable); 413c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 414c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 415c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru pcetable = temp; 416c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 417c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 418c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 419c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru pcetable[offset] = 0; 420f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius pattern->pces = pcetable; 421f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius pattern->pcesLength = offset; 422c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 423c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return result; 424c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 425c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 426c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/** 427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Initializes the pattern struct. 428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success. 429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch UStringSearch data storage 430b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param status output error if any, caller to check status before calling 431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* method, status assumed to be success when passed in. 432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return expansionsize the total expansion size of the pattern 433b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*/ 434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 435b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) 436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (U_FAILURE(*status)) { return 0; } 438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UPattern *pattern = &(strsrch->pattern); 439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *patterntext = pattern->text; 440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t length = pattern->textLength; 441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t index = 0; 442b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Since the strength is primary, accents are ignored in the pattern. 444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->strength == UCOL_PRIMARY) { 445b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pattern->hasPrefixAccents = 0; 446b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pattern->hasSuffixAccents = 0; 447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 448b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> 449b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho SECOND_LAST_BYTE_SHIFT_; 450b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho index = length; 45183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_BACK_1(patterntext, 0, index); 452b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & 453b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho LAST_BYTE_MASK_; 454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 455c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 456c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // ** HACK ** 457f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (strsrch->pattern.pces != NULL) { 458f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 459f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius uprv_free(strsrch->pattern.pces); 460c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 461c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 462f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius strsrch->pattern.pces = NULL; 463c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 464c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // since intializePattern is an internal method status is a success. 466b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return initializePatternCETable(strsrch, status); 467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Initializing shift tables, with the default values. 471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If a corresponding default value is 0, the shift table is not set. 472b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param shift table for forwards shift 473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param backshift table for backwards shift 474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param cetable table containing pattern ce 475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param cesize size of the pattern ces 476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param expansionsize total size of the expansions 477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param defaultforward the default forward value 478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param defaultbackward the default backward value 479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 481b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void setShiftTable(int16_t shift[], int16_t backshift[], 482b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *cetable, int32_t cesize, 483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int16_t expansionsize, 484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int16_t defaultforward, 485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int16_t defaultbackward) 486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 487b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // estimate the value to shift. to do that we estimate the smallest 488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // number of characters to give the relevant ces, ie approximately 489b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // the number of ces minus their expansion, since expansions can come 490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // from a character. 491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count; 492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru shift[count] = defaultforward; 494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cesize --; // down to the last index 496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (count = 0; count < cesize; count ++) { 497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // number of ces from right of array to the count 498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int temp = defaultforward - count - 1; 499c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1; 500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 501c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert shift[hashFromCE32(cetable[cesize])] = 1; 502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for ignorables we just shift by one. see test examples. 503c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert shift[hashFromCE32(0)] = 1; 504b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru backshift[count] = defaultbackward; 507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (count = cesize; count > 0; count --) { 509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the original value count does not seem to work 510c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert backshift[hashFromCE32(cetable[count])] = count > expansionsize ? 511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (int16_t)(count - expansionsize) : 1; 512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 513c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert backshift[hashFromCE32(cetable[0])] = 1; 514c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert backshift[hashFromCE32(0)] = 1; 515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Building of the pattern collation element list and the boyer moore strsrch 519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* table. 520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* The canonical match will only be performed after the default match fails. 521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* For both cases we need to remember the size of the composed and decomposed 522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* versions of the string. Since the Boyer-Moore shift calculations shifts by 523b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* a number of characters in the text and tries to match the pattern from that 524b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* offset, the shift value can not be too large in case we miss some 525b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* characters. To choose a right shift size, we estimate the NFC form of the 526b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* and use its size as a shift guide. The NFC form should be the small 527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* possible representation of the pattern. Anyways, we'll err on the smaller 528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shift size. Hence the calculation for minlength. 529b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Canonical match will be performed slightly differently. We'll split the 530b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* pattern into 3 parts, the prefix accents (PA), the middle string bounded by 531b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* the first and last base character (MS), the ending accents (EA). Matches 532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* will be done on MS first, and only when we match MS then some processing 533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* will be required for the prefix and end accents in order to determine if 534b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* they match PA and EA. Hence the default shift values 535b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* for the canonical match will take the size of either end's accent into 536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* consideration. Forwards search will take the end accents into consideration 537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* for the default shift values and the backwards search will take the prefix 538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* accents into consideration. 539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If pattern has no non-ignorable ce, we return a illegal argument error. 540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success. 541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch UStringSearch data storage 542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status for output errors if it occurs, status is assumed to be a 543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* success when it is passed in. 544b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*/ 545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 546b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void initialize(UStringSearch *strsrch, UErrorCode *status) 547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 548b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int16_t expandlength = initializePattern(strsrch, status); 549f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) { 550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UPattern *pattern = &strsrch->pattern; 551f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t cesize = pattern->cesLength; 552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 553b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int16_t minlength = cesize > expandlength 554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ? (int16_t)cesize - expandlength : 1; 555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pattern->defaultShiftSize = minlength; 556f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius setShiftTable(pattern->shift, pattern->backShift, pattern->ces, 557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cesize, expandlength, minlength, minlength); 558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.defaultShiftSize = 0; 561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 563c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 565b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Check to make sure that the match length is at the end of the character by 566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* using the breakiterator. 567b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param strsrch string search data 568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start target text start offset 569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end target text end offset 570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 572b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, 573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *end) 574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->internalBreakIter; 577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiterator) { 578b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t matchend = *end; 579b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //int32_t matchstart = *start; 580b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 581b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (!ubrk_isBoundary(breakiterator, matchend)) { 582b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *end = ubrk_following(breakiterator, matchend); 583c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 584b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 585b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /* Check the start of the matched text to make sure it doesn't have any accents 586b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * before it. This code may not be necessary and so it is commented out */ 587b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { 588b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *start = ubrk_preceding(breakiterator, matchstart); 589b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho }*/ 590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 595b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Determine whether the target text in UStringSearch bounded by the offset 596b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* start and end is one or more whole units of text as 597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* determined by the breakiterator in UStringSearch. 598b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param strsrch string search data 599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start target text start offset 600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end target text end offset 601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 603b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool isBreakUnit(const UStringSearch *strsrch, int32_t start, 604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end) 605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->breakIter; 608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //TODO: Add here. 609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiterator) { 610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t startindex = ubrk_first(breakiterator); 611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t endindex = ubrk_last(breakiterator); 612b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // out-of-range indexes are never boundary positions 614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (start < startindex || start > endindex || 615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru end < startindex || end > endindex) { 616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 618b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // otherwise, we can use following() on the position before the 619b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // specified one and return true of the position we get back is the 620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // one the user specified 621b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool result = (start == startindex || 622b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ubrk_following(breakiterator, start - 1) == start) && 623b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (end == endindex || 624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ubrk_following(breakiterator, end - 1) == end); 625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result) { 626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // iterates the individual ces 627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 628b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *text = strsrch->search->text + 629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru start; 630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, text, end - start, &status); 632f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (int32_t count = 0; count < strsrch->pattern.cesLength; 633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++) { 634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 639f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) { 640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t nextce = ucol_next(coleiter, &status); 644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ucol_getOffset(coleiter) == (end - start) 645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru && getCE(strsrch, nextce) == UCOL_IGNORABLE) { 646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextce = ucol_next(coleiter, &status); 647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) == (end - start) 649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru && nextce != UCOL_NULLORDER) { 650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // extra collation elements at the end of the match 651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 661b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Getting the next base character offset if current offset is an accent, 662b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* or the current offset if the current character contains a base character. 663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* accents the following base character will be returned 664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string 665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset current offset 666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textlength length of text string 667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return the next base character or the current offset 668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* if the current character is contains a base character. 669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 671b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getNextBaseOffset(const UChar *text, 672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength) 674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textoffset < textlength) { 676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 678b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru while (temp < textlength) { 679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = temp; 680b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((getFCD(text, &temp, textlength) >> 681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru SECOND_LAST_BYTE_SHIFT_) == 0) { 682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textlength; 686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the next base character offset depending on the string search pattern 693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* data 694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset current offset, one offset away from the last character 696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* to search for. 697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return start index of the next base character or the current offset 698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* if the current character is contains a base character. 699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 701b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, 702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset) 703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 705b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strsrch->pattern.hasSuffixAccents && 706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset < textlength) { 707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 70983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_BACK_1(text, 0, temp); 710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return getNextBaseOffset(text, textoffset, textlength); 712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Shifting the collation element iterator position forward to prepare for 719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* a following match. If the last character is a unsafe character, we'll only 720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shift by 1 to capture contractions, normalization etc. 721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success. 722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text strsrch string search data 723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset start text position to do search 724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param ce the text ce which failed the match. 725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param patternceindex index of the ce within the pattern ce buffer which 726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* failed the match 727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return final offset 728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int32_t shiftForward(UStringSearch *strsrch, 731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce, 733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex) 734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UPattern *pattern = &(strsrch->pattern); 736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_NULLORDER) { 737c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert int32_t shift = pattern->shift[hashFromCE32(ce)]; 738b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // this is to adjust for characters in the middle of the 739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // substring for matching that failed. 740f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t adjust = pattern->cesLength - patternceindex; 741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (adjust > 1 && shift >= adjust) { 742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru shift -= adjust - 1; 743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset += shift; 745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset += pattern->defaultShiftSize; 748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 749b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); 751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // check for unsafe characters 752b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // * if it is the start or middle of a contraction: to be done after 753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // a initial match is found 754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // * thai or lao base consonant character: similar to contraction 755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // * high surrogate character: similar to contraction 756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // * next character is a accent: shift to the next base character 757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 759c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif // #if BOYER_MOORE 760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 762b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* sets match not found 763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 766b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline void setMatchNotFound(UStringSearch *strsrch) 767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this method resets the match result regardless of the error status. 769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = USEARCH_DONE; 770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = 0; 771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isForwardSearching) { 772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, strsrch->search->textLength); 773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, 0); 776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 779c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the offset to the next safe point in text. 782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* ie. not the middle of a contraction, swappable characters or supplementary 783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* characters. 784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param collator collation sata 785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string to work with 786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in string 787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textlength length of text string 788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return offset to the next safe character 789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 791b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getNextSafeOffset(const UCollator *collator, 792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text, 793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength) 795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = textoffset; // first contraction character 797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (result != textlength && ucol_unsafeCP(text[result], collator)) { 798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result ++; 799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 800b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return result; 801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 803b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru/** 804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This checks for accents in the potential match started with a . 805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* composite character. 806b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* This is really painful... we have to check that composite character do not 807b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* have any extra accents. We have to normalize the potential match and find 808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* the immediate decomposed character before the match. 809b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* The first composite character would have been taken care of by the fcd 810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* checks in checkForwardExactMatch. 811b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* This is the slow path after the fcd of the first character and 812b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* the last character has been checked by checkForwardExactMatch and we 813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* determine that the potential match has extra non-ignorable preceding 814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* ces. 815b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* E.g. looking for \u0301 acute in \u01FA A ring above and acute, 816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* checkExtraMatchAccent should fail since there is a middle ring in \u01FA 817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Note here that accents checking are slow and cautioned in the API docs. 818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be a success, caller should check status 819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method 820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start index of the potential unfriendly composite character 822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end index of the potential unfriendly composite character 823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any. 824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if there is non-ignorable accents before at the beginning 825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* of the match, FALSE otherwise. 826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, 830b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end, 831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool result = FALSE; 834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasPrefixAccents) { 835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t length = end - start; 836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = 0; 837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text + start; 838b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 83983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_FWD_1(text, offset, length); 840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we are only concerned with the first composite character 841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { 842b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t safeoffset = getNextSafeOffset(strsrch->collator, 843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru text, 0, length); 844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (safeoffset != length) { 845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safeoffset ++; 846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *norm = NULL; 848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar buffer[INITIAL_ARRAY_SIZE_]; 849b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, 850b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru buffer, INITIAL_ARRAY_SIZE_, 851b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status); 852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (size >= INITIAL_ARRAY_SIZE_) { 856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), 857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 858b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // if allocation failed, status will be set to 859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally 860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // checks for it. 861b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, 862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru size, status); 863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) && norm != NULL) { 864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(norm); 865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru norm = buffer; 870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, norm, size, status); 874f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius uint32_t firstce = strsrch->pattern.ces[0]; 875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool ignorable = TRUE; 876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t ce = UCOL_IGNORABLE; 877c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { 878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset = ucol_getOffset(coleiter); 879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != firstce && ce != UCOL_IGNORABLE) { 880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ignorable = FALSE; 881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = ucol_next(coleiter, status); 883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 codepoint; 88583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_PREV(norm, 0, offset, codepoint); 886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = !ignorable && (u_getCombiningClass(codepoint) != 0); 887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (norm != buffer) { 889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(norm); 890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 898b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Used by exact matches, checks if there are accents before the match. 899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This is really painful... we have to check that composite characters at 900b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* the start of the matches have to not have any extra accents. 901b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* We check the FCD of the character first, if it starts with an accent and 902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* the first pattern ce does not match the first ce of the character, we bail. 903b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Otherwise we try normalizing the first composite 904b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* character and find the immediate decomposed character before the match to 905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* see if it is an non-ignorable accent. 906b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Now normalizing the first composite character is enough because we ensure 907b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* that when the match is passed in here with extra beginning ces, the 908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* first or last ce that match has to occur within the first character. 909b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* E.g. looking for \u0301 acute in \u01FA A ring above and acute, 910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* checkExtraMatchAccent should fail since there is a middle ring in \u01FA 911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Note here that accents checking are slow and cautioned in the API docs. 912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 913b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param start offset 914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset 915b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return TRUE if there are accents on either side of the match, 916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* FALSE otherwise 917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, 920b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end) 921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasPrefixAccents) { 923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we have been iterating forwards previously 926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t ignorable = TRUE; 927f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t firstce = strsrch->pattern.ces[0]; 928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, start); 930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ce != firstce) { 935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_IGNORABLE) { 936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ignorable = FALSE; 937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_next(coleiter, &status)); 939c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!ignorable && inNormBuf(coleiter)) { 944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // within normalization buffer, discontiguous handled here 945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // within text 949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = start; 950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // original code 951b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // accent = (getFCD(strsrch->search->text, &temp, 952b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // strsrch->search->textLength) 953b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // >> SECOND_LAST_BYTE_SHIFT_); 954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // however this code does not work well with VC7 .net in release mode. 955b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // maybe the inlines for getFCD combined with shifting has bugs in 956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // VC7. anyways this is a work around. 957b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool accent = getFCD(strsrch->search->text, &temp, 958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength) > 0xFF; 959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!accent) { 960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return checkExtraMatchAccents(strsrch, start, end, &status); 961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!ignorable) { 963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (start > 0) { 966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = start; 96783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_BACK_1(strsrch->search->text, 0, temp); 968b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (getFCD(strsrch->search->text, &temp, 969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength) & LAST_BYTE_MASK_) { 970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, start); 971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = ucol_previous(coleiter, &status); 972b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status) || 973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { 974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 979b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Used by exact matches, checks if there are accents bounding the match. 985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Note this is the initial boundary check. If the potential match 986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* starts or ends with composite characters, the accents in those 987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* characters will be determined later. 988b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Not doing backwards iteration here, since discontiguos contraction for 989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* backwards collation element iterator, use up too many characters. 990b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* E.g. looking for \u030A ring in \u01FA A ring above and acute, 991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* should fail since there is a acute at the end of \u01FA 992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Note here that accents checking are slow and cautioned in the API docs. 993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of match 995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end end offset of the match 996b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return TRUE if there are accents on either side of the match, 997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* FALSE otherwise 998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1000b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, 1001b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end) 1002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasSuffixAccents) { 1004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = end; 1006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 100783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_BACK_1(text, 0, temp); 1008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 1009f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t firstce = strsrch->pattern.ces[0]; 1010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 1012b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t ce; 1013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, start); 1014c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { 1015c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 1016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = 1; 1020f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius while (count < strsrch->pattern.cesLength) { 1021b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (getCE(strsrch, ucol_next(coleiter, &status)) 1022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru == UCOL_IGNORABLE) { 1023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Thai can give an ignorable here. 1024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 1025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++; 1030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1031b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1032b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ce = ucol_next(coleiter, &status); 1033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1037b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ce = getCE(strsrch, ce); 1038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) <= end) { 1041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 1044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1048b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1051c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif // #if BOYER_MOORE 1052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks if the offset runs out of the text string 1055b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param offset 1056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textlength of the text string 1057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if offset is out of bounds, FALSE otherwise 1058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline UBool isOutOfBounds(int32_t textlength, int32_t offset) 1061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return offset < 0 || offset > textlength; 1063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks for identical match 1067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of possible match 1069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of possible match 1070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if identical match is found 1071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1073b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, 1074b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end) 1075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->strength != UCOL_IDENTICAL) { 1077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 108050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Note: We could use Normalizer::compare() or similar, but for short strings 108150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // which may not be in FCD it might be faster to just NFD them. 108250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode status = U_ZERO_ERROR; 108350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString t2, p2; 108450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->nfd->normalize( 108550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); 108650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->nfd->normalize( 108750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); 1088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // return FALSE if NFD failed 108950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_SUCCESS(status) && t2 == p2; 1090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1092b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if BOYER_MOORE 1093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1094b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks to see if the match is repeated 1095b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start new match start index 1097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end new match end index 1098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the the match is repeated, FALSE otherwise 1099b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline UBool checkRepeatedMatch(UStringSearch *strsrch, 1102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start, 1103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end) 1104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastmatchindex = strsrch->search->matchedIndex; 1106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool result; 1107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (lastmatchindex == USEARCH_DONE) { 1108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isForwardSearching) { 1111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = start <= lastmatchindex; 1112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = start >= lastmatchindex; 1115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!result && !strsrch->search->isOverlap) { 1117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isForwardSearching) { 1118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = start < lastmatchindex + strsrch->search->matchedLength; 1119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = end > lastmatchindex; 1122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the collation element iterator's current offset. 1129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator 1130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param forwards flag TRUE if we are moving in th forwards direction 1131b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return current offset 1132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int32_t getColElemIterOffset(const UCollationElements *coleiter, 1135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool forwards) 1136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = ucol_getOffset(coleiter); 1138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // intricacies of the the backwards collation element iterator 1139c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { 1140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result ++; 1141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1146b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks match for contraction. 1147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match ends with a partial contraction we fail. 1148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match starts too far off (because of backwards iteration) we try to 1149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* chip off the extra characters depending on whether a breakiterator has 1150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* been used. 1151b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, error assumed to be success, caller has to check status 1152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of potential match, to be modified if necessary 1155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of potential match, to be modified if necessary 1156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if match passes the contraction test, FALSE otherwise 1158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1161b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool checkNextExactContractionMatch(UStringSearch *strsrch, 1162b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *start, 1163b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *end, UErrorCode *status) 1164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 1167c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t temp = *start; 1168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 1169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1170b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // This part checks if either ends of the match contains potential 1171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // contraction. If so we'll have to iterate through them 1172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The start contraction needs to be checked since ucol_previous dumps 1173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // all characters till the first safe character into the buffer. 1174b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // *start + 1 is used to test for the unsafe characters instead of *start 1175b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // because ucol_prev takes all unsafe characters till the first safe 1176b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // character ie *start. so by testing *start + 1, we can estimate if 1177b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // excess prefix characters has been included in the potential search 1178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // results. 1179b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1180b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (*start + 1 < textlength 1181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru && ucol_unsafeCP(text[*start + 1], collator))) { 1182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expansion = getExpansionPrefix(coleiter); 1183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool expandflag = expansion > 0; 1184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, *start); 1185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (expansion > 0) { 1186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // getting rid of the redundant ce, caused by setOffset. 1187b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // since backward contraction/expansion may have extra ces if we 1188b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // are in the normalization buffer, hasAccentsBeforeMatch would 1189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // have taken care of it. 1190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // E.g. the character \u01FA will have an expansion of 3, but if 1191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we are only looking for acute and ring \u030A and \u0301, we'll 1192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // have to skip the first ce in the expansion buffer. 1193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_next(coleiter, status); 1194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) != temp) { 1198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = temp; 1199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 1200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expansion --; 1202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1204f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *patternce = strsrch->pattern.ces; 1205f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t patterncelength = strsrch->pattern.cesLength; 1206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = 0; 1207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (count < patterncelength) { 1208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 1210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 1211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = temp; 1214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 1215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || ce != patternce[count]) { 1217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*end) ++; 1218b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *end = getNextUStringSearchBaseOffset(strsrch, *end); 1219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++; 1222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1223b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 1224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks and sets the match information if found. 1229b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks 1230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <ul> 1231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> the potential match does not repeat the previous match 1232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> boundaries are correct 1233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> exact matches has no extra accents 1234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> identical matchesb 1235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> potential match does not end in the middle of a contraction 1236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <\ul> 1237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Otherwise the offset will be shifted to the next character. 1238b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in the collation element text. the returned value 1242b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* will be the truncated end offset of the match or the new start 1243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* search offset. 1244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 1246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1248b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkNextExactMatch(UStringSearch *strsrch, 1249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *textoffset, UErrorCode *status) 1250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1252b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t start = getColElemIterOffset(coleiter, FALSE); 1253b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) { 1255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this totally matches, however we need to check if it is repeating 1259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!isBreakUnit(strsrch, start, *textoffset) || 1260b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru checkRepeatedMatch(strsrch, start, *textoffset) || 1261b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru hasAccentsBeforeMatch(strsrch, start, *textoffset) || 1262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru !checkIdentical(strsrch, start, *textoffset) || 1263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru hasAccentsAfterMatch(strsrch, start, *textoffset)) { 1264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*textoffset) ++; 1266b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset); 1267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Add breakiterator boundary check for primary strength search. 1271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 1272b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho checkBreakBoundary(strsrch, &start, textoffset); 1273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1274b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // totally match, we will get rid of the ending ignorables. 1276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = start; 1277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = *textoffset - start; 1278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1282b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Getting the previous base character offset, or the current offset if the 1283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* current character is a base character 1284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string 1285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset one offset after the current character 1286b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @return the offset of the next character after the base character or the first 1287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* composed character with accents 1288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1290b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getPreviousBaseOffset(const UChar *text, 1291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset) 1292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textoffset > 0) { 1294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 1295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = textoffset; 129683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_BACK_1(text, 0, textoffset); 1297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 1298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint16_t fcd = getFCD(text, &temp, result); 1299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { 1300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fcd & LAST_BYTE_MASK_) { 1301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 1302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textoffset == 0) { 1306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 1307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 1311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Getting the indexes of the accents that are not blocked in the argument 1315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* accent array 1316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param accents array of accents in nfd terminated by a 0. 1317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param accentsindex array of indexes of the accents that are not blocked 1318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex) 1321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t index = 0; 1323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t length = u_strlen(accents); 1324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 codepoint = 0; 1325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int cclass = 0; 1326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int result = 0; 1327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp; 1328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (index < length) { 1329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = index; 133083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_NEXT(accents, index, length, codepoint); 1331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (u_getCombiningClass(codepoint) != cclass) { 1332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cclass = u_getCombiningClass(codepoint); 1333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru accentsindex[result] = temp; 1334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result ++; 1335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru accentsindex[result] = length; 1338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Appends 3 UChar arrays to a destination array. 1343b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Creates a new array if we run out of space. The caller will have to 1344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* manually deallocate the newly allocated array. 1345b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1346b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* before calling this method. destination not to be NULL and has at least 1347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* size destinationlength. 1348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param destination target array 1349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param destinationlength target array size, returning the appended length 1350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param source1 null-terminated first array 1351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param source2 second array 1352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param source2length length of seond array 1353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param source3 null-terminated third array 1354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any 1355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return new destination array, destination if there was no new allocation 1356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1358b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UChar * addToUCharArray( UChar *destination, 1359b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *destinationlength, 1360b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *source1, 1361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *source2, 1362b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t source2length, 1363b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *source3, 1364b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 1365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t source1length = source1 ? u_strlen(source1) : 0; 1367b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t source3length = source3 ? u_strlen(source3) : 0; 1368b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (*destinationlength < source1length + source2length + source3length + 1369b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1) 1370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru destination = (UChar *)allocateMemory( 1372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (source1length + source2length + source3length + 1) * sizeof(UChar), 1373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1374b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // if error allocating memory, status will be 1375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // U_MEMORY_ALLOCATION_ERROR 1376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *destinationlength = 0; 1378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 1379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (source1length != 0) { 1382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memcpy(destination, source1, sizeof(UChar) * source1length); 1383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (source2length != 0) { 1385b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uprv_memcpy(destination + source1length, source2, 1386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(UChar) * source2length); 1387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (source3length != 0) { 1389b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uprv_memcpy(destination + source1length + source2length, source3, 1390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(UChar) * source3length); 1391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *destinationlength = source1length + source2length + source3length; 1393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return destination; 1394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Running through a collation element iterator to see if the contents matches 1398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* pattern in string search data 1399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator 1401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if a match if found, FALSE otherwise 1402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1404b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkCollationMatch(const UStringSearch *strsrch, 1405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter) 1406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1407f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int patternceindex = strsrch->pattern.cesLength; 1408f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *patternce = strsrch->pattern.ces; 1409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 1410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (patternceindex > 0) { 1411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 1413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 1414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status) || ce != *patternce) { 1416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternce ++; 1419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex --; 1420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Rearranges the front accents to try matching. 1426b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Prefix accents in the text will be grouped according to their combining 1427b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* class and the groups will be mixed and matched to try find the perfect 1428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* match with the pattern. 1429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1431b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* "\u0301\u0325". 1433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 2: check if any of the generated substrings matches the pattern. 1434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status is assumed to be success, caller has to check status 1435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search match 1437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start first offset of the accents to start searching 1438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end start of the last accent set 1439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return USEARCH_DONE if a match is not found, otherwise return the starting 1441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offset of the match. Note this start includes all preceding accents. 1442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1444b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruint32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch, 1445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start, 1446b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end, 1447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 1448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 1451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tempstart = start; 1452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) { 1454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // die... failed at a base character 1455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = getNextBaseOffset(text, tempstart, textlength); 1459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru start = getPreviousBaseOffset(text, tempstart); 1460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar accents[INITIAL_ARRAY_SIZE_]; 1462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalizing the offensive string 1463b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents, 1464b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru INITIAL_ARRAY_SIZE_, status); 1465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1468b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1469b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1470b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t accentsize = getUnblockedAccentIndex(accents, 1471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru accentsindex); 1472b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t count = (2 << (accentsize - 1)) - 1; 1473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar buffer[INITIAL_ARRAY_SIZE_]; 1474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 1475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && count > 0) { 1476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *rearrange = strsrch->canonicalPrefixAccents; 1477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // copy the base characters 1478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int k = 0; k < accentsindex[0]; k ++) { 1479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[k]; 1480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forming all possible canonical rearrangement by dropping 1482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets of accents 1483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i <= accentsize - 1; i ++) { 1484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t mask = 1 << (accentsize - i - 1); 1485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count & mask) { 1486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[j]; 1488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange = 0; 1492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t matchsize = INITIAL_ARRAY_SIZE_; 1493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *match = addToUCharArray(buffer, &matchsize, 1494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents, 1495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->text + offset, 1496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru end - offset, 1497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents, 1498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1499b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setText does nothing. 1501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // run the collator iterator through this match 1502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, match, matchsize, status); 1503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 1504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkCollationMatch(strsrch, coleiter)) { 1505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match != buffer) { 1506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(match); 1507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return start; 1509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 1512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the offset to the safe point in text before textoffset. 1518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* ie. not the middle of a contraction, swappable characters or supplementary 1519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* characters. 1520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param collator collation sata 1521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string to work with 1522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in string 1523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textlength length of text string 1524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return offset to the previous safe character 1525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1527b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline uint32_t getPreviousSafeOffset(const UCollator *collator, 1528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text, 1529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset) 1530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = textoffset; // first contraction character 1532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) { 1533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result --; 1534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result != 0) { 1536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the first contraction character is consider unsafe here 1537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result --; 1538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1539b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return result; 1540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Cleaning up after we passed the safe zone 1544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param safetext safe text array 1546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param safebuffer safe text buffer 1547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator for safe text 1548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext, 1551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *safebuffer) 1552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1553b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents) 1554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(safetext); 1556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Take the rearranged end accents and tries matching. If match failed at 1561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* a seperate preceding set of accents (seperated from the rearranged on by 1562b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* at least a base character) then we rearrange the preceding accents and 1563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* tries matching again. 1564b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* We allow skipping of the ends of the accent set if the ces do not match. 1565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* However if the failure is found before the accent set, it fails. 1566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset of the start of the rearranged accent 1570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return USEARCH_DONE if a match is not found, otherwise return the starting 1572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offset of the match. Note this start includes all preceding accents. 1573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1575b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruint32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, 1576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 1577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 1578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 1581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safelength = 0; 1582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *safetext; 1583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safetextlength; 1584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar safebuffer[INITIAL_ARRAY_SIZE_]; 1585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 1586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safeoffset = textoffset; 1587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1588b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0], 1589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru collator)) { 1590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safeoffset = getPreviousSafeOffset(collator, text, textoffset); 1591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safelength = textoffset - safeoffset; 1592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetextlength = INITIAL_ARRAY_SIZE_; 1593b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru safetext = addToUCharArray(safebuffer, &safetextlength, NULL, 1594b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru text + safeoffset, safelength, 1595b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->canonicalSuffixAccents, 1596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetextlength = u_strlen(strsrch->canonicalSuffixAccents); 1600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetext = strsrch->canonicalSuffixAccents; 1601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setText does nothing 1604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, safetext, safetextlength, status); 1605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in loop below 1606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1607f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *ce = strsrch->pattern.ces; 1608f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t celength = strsrch->pattern.cesLength; 1609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int ceindex = celength - 1; 1610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool isSafe = TRUE; // indication flag for position in safe zone 1611b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ceindex >= 0) { 1613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textce = ucol_previous(coleiter, status); 1614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 1616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce == UCOL_NULLORDER) { 1621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // check if we have passed the safe buffer 1622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (coleiter == strsrch->textIter) { 1623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetext = safebuffer; 1628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru coleiter = strsrch->textIter; 1629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, safeoffset); 1630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked at the start of the loop 1631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru isSafe = FALSE; 1632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 1633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textce = getCE(strsrch, textce); 1635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 1636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // do the beginning stuff 1637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t failedoffset = getColElemIterOffset(coleiter, FALSE); 1638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe && failedoffset >= safelength) { 1639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // alas... no hope. failed at rearranged accent set 1640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 1645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru failedoffset += safeoffset; 1646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1648b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // try rearranging the front accents 1650b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t result = doNextCanonicalPrefixMatch(strsrch, 1651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru failedoffset, textoffset, status); 1652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result != USEARCH_DONE) { 1653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setOffset does nothing 1654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, result); 1655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 1658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce == ce[ceindex]) { 1663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ceindex --; 1664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // set offset here 1667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 1668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = getColElemIterOffset(coleiter, FALSE); 1669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets the text iterator here with the correct expansion and offset 1670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t leftoverces = getExpansionPrefix(coleiter); 1671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 1672b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (result >= safelength) { 1673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = textoffset; 1674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result += safeoffset; 1677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, result); 1679b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->textIter->iteratordata_.toReturn = 1680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setExpansionPrefix(strsrch->textIter, leftoverces); 1681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 1682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1683b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1684b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return ucol_getOffset(coleiter); 1685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Trying out the substring and sees if it can be a canonical match. 1689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This will try normalizing the end accents and arranging them into canonical 1690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* equivalents and check their corresponding ces with the pattern ce. 1691b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Suffix accents in the text will be grouped according to their combining 1692b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* class and the groups will be mixed and matched to try find the perfect 1693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* match with the pattern. 1694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1696b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* "\u0301\u0325". 1698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 2: check if any of the generated substrings matches the pattern. 1699b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1702b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param textoffset end offset in the collation element text that ends with 1703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* the accents to be rearranged 1704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any 1705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 1706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1708b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool doNextCanonicalMatch(UStringSearch *strsrch, 1709b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t textoffset, 1710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 1711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 171483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_BACK_1(text, 0, temp); 1715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { 1716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = getColElemIterOffset(coleiter, FALSE); 1718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasPrefixAccents) { 1719b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset, 1720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 1722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, offset); 1723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!strsrch->pattern.hasSuffixAccents) { 1730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar accents[INITIAL_ARRAY_SIZE_]; 1734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // offset to the last base character in substring to search 1735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t baseoffset = getPreviousBaseOffset(text, textoffset); 1736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalizing the offensive string 1737b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD, 1738b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0, accents, INITIAL_ARRAY_SIZE_, status); 1739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in loop below 1740b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t size = getUnblockedAccentIndex(accents, accentsindex); 1743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2 power n - 1 plus the full set of accents 1745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = (2 << (size - 1)) - 1; 1746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && count > 0) { 1747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *rearrange = strsrch->canonicalSuffixAccents; 1748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // copy the base characters 1749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int k = 0; k < accentsindex[0]; k ++) { 1750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[k]; 1751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forming all possible canonical rearrangement by dropping 1753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets of accents 1754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i <= size - 1; i ++) { 1755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t mask = 1 << (size - i - 1); 1756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count & mask) { 1757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[j]; 1759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange = 0; 1763b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset, 1764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 1765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset != USEARCH_DONE) { 1766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; // match found 1767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 1769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1774b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Gets the previous base character offset depending on the string search 1775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* pattern data 1776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset current offset, current character 1778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return the offset of the next character after this base character or itself 1779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* if it is a composed character with accents 1780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1782b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch, 1783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset) 1784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasPrefixAccents && textoffset > 0) { 1786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = textoffset; 1788b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (getFCD(text, &offset, strsrch->search->textLength) >> 1789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru SECOND_LAST_BYTE_SHIFT_) { 1790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return getPreviousBaseOffset(text, textoffset); 1791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 1794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1797b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks match for contraction. 1798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match ends with a partial contraction we fail. 1799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match starts too far off (because of backwards iteration) we try to 1800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* chip off the extra characters 1801b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of potential match, to be modified if necessary 1805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of potential match, to be modified if necessary 1806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if match passes the contraction test, FALSE otherwise 1808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1810b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, 1811b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *start, 1812b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *end, 1813b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 1814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 1817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = *start; 1818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 1819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 1820b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // This part checks if either ends of the match contains potential 1821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // contraction. If so we'll have to iterate through them 1822b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1823b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (*start + 1 < textlength 1824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru && ucol_unsafeCP(text[*start + 1], collator))) { 1825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expansion = getExpansionPrefix(coleiter); 1826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool expandflag = expansion > 0; 1827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, *start); 1828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (expansion > 0) { 1829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // getting rid of the redundant ce, caused by setOffset. 1830b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // since backward contraction/expansion may have extra ces if we 1831b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // are in the normalization buffer, hasAccentsBeforeMatch would 1832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // have taken care of it. 1833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // E.g. the character \u01FA will have an expansion of 3, but if 1834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we are only looking for acute and ring \u030A and \u0301, we'll 1835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // have to skip the first ce in the expansion buffer. 1836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_next(coleiter, status); 1837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 1838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) != temp) { 1841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = temp; 1842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 1843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expansion --; 1845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1847f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *patternce = strsrch->pattern.ces; 1848f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t patterncelength = strsrch->pattern.cesLength; 1849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = 0; 1850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 1851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (count < patterncelength) { 1852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below, note that if status is a failure 1854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ucol_next returns UCOL_NULLORDER 1855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 1856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 1857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = temp; 1860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 1861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count == 0 && ce != patternce[0]) { 1864b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // accents may have extra starting ces, this occurs when a 1865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pure accent pattern is matched without rearrangement 1866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // text \u0325\u0300 and looking for \u0300 1867b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t expected = patternce[0]; 1868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { 1869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_next(coleiter, status)); 1870b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru while (U_SUCCESS(*status) && ce != expected && 1871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce != UCOL_NULLORDER && 1872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_getOffset(coleiter) <= *end) { 1873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_next(coleiter, status)); 1874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || ce != patternce[count]) { 1878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*end) ++; 1879b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *end = getNextUStringSearchBaseOffset(strsrch, *end); 1880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++; 1883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1884b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 1885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks and sets the match information if found. 1890b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks 1891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <ul> 1892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> the potential match does not repeat the previous match 1893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> boundaries are correct 1894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> potential match does not end in the middle of a contraction 1895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> identical matches 1896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <\ul> 1897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Otherwise the offset will be shifted to the next character. 1898b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check the 1899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* status before calling this method. 1900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 1901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in the collation element text. the returned value 1902b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* will be the truncated end offset of the match or the new start 1903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* search offset. 1904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 1905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 1906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1908b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkNextCanonicalMatch(UStringSearch *strsrch, 1909b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *textoffset, 1910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 1911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 1912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to ensure that the start and ends are not composite characters 1913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 1914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we have a canonical accent match 1915b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((strsrch->pattern.hasSuffixAccents && 1916b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->canonicalSuffixAccents[0]) || 1917b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (strsrch->pattern.hasPrefixAccents && 1918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0])) { 1919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset( 1920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch, 1921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_getOffset(coleiter)); 1922b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->matchedLength = *textoffset - 1923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex; 1924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start = getColElemIterOffset(coleiter, FALSE); 1928b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset, 1929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status) || U_FAILURE(*status)) { 1930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1932b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru start = getPreviousUStringSearchBaseOffset(strsrch, start); 1934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this totally matches, however we need to check if it is repeating 1935b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (checkRepeatedMatch(strsrch, start, *textoffset) || 1936b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru !isBreakUnit(strsrch, start, *textoffset) || 1937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru !checkIdentical(strsrch, start, *textoffset)) { 1938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*textoffset) ++; 1939b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset, 1940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength); 1941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 1942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1943b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = start; 1945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = *textoffset - start; 1946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 1947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Shifting the collation element iterator position forward to prepare for 1951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* a preceding match. If the first character is a unsafe character, we'll only 1952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shift by 1 to capture contractions, normalization etc. 1953b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 1954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 1955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text strsrch string search data 1956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset start text position to do search 1957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param ce the text ce which failed the match. 1958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param patternceindex index of the ce within the pattern ce buffer which 1959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* failed the match 1960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return final offset 1961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 1962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 1963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruinline int32_t reverseShift(UStringSearch *strsrch, 1964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 1965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce, 1966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex) 1967b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{ 1968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isOverlap) { 1969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textoffset != strsrch->search->textLength) { 1970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset --; 1971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset -= strsrch->pattern.defaultShiftSize; 1974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce != UCOL_NULLORDER) { 1978c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)]; 1979b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 1980b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // this is to adjust for characters in the middle of the substring 1981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for matching that failed. 1982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t adjust = patternceindex; 1983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (adjust > 1 && shift > adjust) { 1984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru shift -= adjust - 1; 1985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset -= shift; 1987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 1989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset -= strsrch->pattern.defaultShiftSize; 1990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1991b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 1992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset); 1993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return textoffset; 1994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 1997b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks match for contraction. 1998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match starts with a partial contraction we fail. 1999b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of potential match, to be modified if necessary 2003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of potential match, to be modified if necessary 2004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 2005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if match passes the contraction test, FALSE otherwise 2006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2008b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool checkPreviousExactContractionMatch(UStringSearch *strsrch, 2009b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *start, 2010b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *end, UErrorCode *status) 2011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 2013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 2014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = *end; 2015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 2016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2017b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // This part checks if either if the start of the match contains potential 2018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // contraction. If so we'll have to iterate through them 2019b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Since we used ucol_next while previously looking for the potential 2020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // match, this guarantees that our end will not be a partial contraction, 2021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // or a partial supplementary character. 2022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expansion = getExpansionSuffix(coleiter); 2024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool expandflag = expansion > 0; 2025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, *end); 2026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && expansion > 0) { 2027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // getting rid of the redundant ce 2028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // since forward contraction/expansion may have extra ces 2029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we are in the normalization buffer, hasAccentsBeforeMatch 2030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // would have taken care of it. 2031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // E.g. the character \u01FA will have an expansion of 3, but if 2032b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // we are only looking for A ring A\u030A, we'll have to skip the 2033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // last ce in the expansion buffer 2034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_previous(coleiter, status); 2035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) != temp) { 2039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *end = temp; 2040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 2041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expansion --; 2043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2045f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *patternce = strsrch->pattern.ces; 2046f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t patterncelength = strsrch->pattern.cesLength; 2047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = patterncelength; 2048b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (count > 0) { 2049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below, note that if status is a failure 2051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ucol_previous returns UCOL_NULLORDER 2052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 2053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2055b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (expandflag && count == 0 && 2056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru getColElemIterOffset(coleiter, FALSE) != temp) { 2057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *end = temp; 2058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 2059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*start) --; 2062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = getPreviousBaseOffset(text, *start); 2063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 2066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2067b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 2068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks and sets the match information if found. 2073b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks 2074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <ul> 2075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> the current match does not repeat the last match 2076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> boundaries are correct 2077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> exact matches has no extra accents 2078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> identical matches 2079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <\ul> 2080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Otherwise the offset will be shifted to the preceding character. 2081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2084b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param collator 2085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param coleiter collation element iterator 2086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param text string 2087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in the collation element text. the returned value 2088b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* will be the truncated start offset of the match or the new start 2089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* search offset. 2090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 2091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 2092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2094b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkPreviousExactMatch(UStringSearch *strsrch, 2095b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *textoffset, 2096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to ensure that the start and ends are not composite characters 2099b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end = ucol_getOffset(strsrch->textIter); 2100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status) 2101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru || U_FAILURE(*status)) { 2102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2104b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this totally matches, however we need to check if it is repeating 2106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the old match 2107b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (checkRepeatedMatch(strsrch, *textoffset, end) || 2108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru !isBreakUnit(strsrch, *textoffset, end) || 2109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru hasAccentsBeforeMatch(strsrch, *textoffset, end) || 2110b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru !checkIdentical(strsrch, *textoffset, end) || 2111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru hasAccentsAfterMatch(strsrch, *textoffset, end)) { 2112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*textoffset) --; 2113b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *textoffset = getPreviousBaseOffset(strsrch->search->text, 2114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *textoffset); 2115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2117b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //Add breakiterator boundary check for primary strength search. 2119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 2120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho checkBreakBoundary(strsrch, textoffset, &end); 2121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2122b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = *textoffset; 2124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = end - *textoffset; 2125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Rearranges the end accents to try matching. 2130b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Suffix accents in the text will be grouped according to their combining 2131b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* class and the groups will be mixed and matched to try find the perfect 2132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* match with the pattern. 2133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2135b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* "\u0301\u0325". 2137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 2: check if any of the generated substrings matches the pattern. 2138b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, user has to check status 2139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search match 2141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of the first base character 2142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end start of the last accent set 2143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status only error status if any 2144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return USEARCH_DONE if a match is not found, otherwise return the ending 2145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offset of the match. Note this start includes all following accents. 2146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2148b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruint32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch, 2149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start, 2150b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t end, 2151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tempend = end; 2155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 215683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_BACK_1(text, 0, tempend); 2157b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (!(getFCD(text, &tempend, strsrch->search->textLength) & 2158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru LAST_BYTE_MASK_)) { 2159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // die... failed at a base character 2160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru end = getNextBaseOffset(text, end, strsrch->search->textLength); 2163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 2165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar accents[INITIAL_ARRAY_SIZE_]; 2166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = getPreviousBaseOffset(text, end); 2167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalizing the offensive string 2168b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents, 2169b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru INITIAL_ARRAY_SIZE_, status); 2170b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2171b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2172b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t accentsize = getUnblockedAccentIndex(accents, 2173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru accentsindex); 2174b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t count = (2 << (accentsize - 1)) - 1; 2175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar buffer[INITIAL_ARRAY_SIZE_]; 2176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 2177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && count > 0) { 2178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *rearrange = strsrch->canonicalSuffixAccents; 2179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // copy the base characters 2180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int k = 0; k < accentsindex[0]; k ++) { 2181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[k]; 2182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forming all possible canonical rearrangement by dropping 2184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets of accents 2185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i <= accentsize - 1; i ++) { 2186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t mask = 1 << (accentsize - i - 1); 2187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count & mask) { 2188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[j]; 2190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange = 0; 2194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t matchsize = INITIAL_ARRAY_SIZE_; 2195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *match = addToUCharArray(buffer, &matchsize, 2196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents, 2197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->text + start, 2198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset - start, 2199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents, 2200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status); 2201b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // run the collator iterator through this match 2203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure ucol_setText does nothing 2204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, match, matchsize, status); 2205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 2206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkCollationMatch(strsrch, coleiter)) { 2207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match != buffer) { 2208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(match); 2209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return end; 2211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 2214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Take the rearranged start accents and tries matching. If match failed at 2221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* a seperate following set of accents (seperated from the rearranged on by 2222b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* at least a base character) then we rearrange the preceding accents and 2223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* tries matching again. 2224b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* We allow skipping of the ends of the accent set if the ces do not match. 2225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* However if the failure is found before the accent set, it fails. 2226b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset of the ends of the rearranged accent 2230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 2231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return USEARCH_DONE if a match is not found, otherwise return the ending 2232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offset of the match. Note this start includes all following accents. 2233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2235b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruint32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, 2236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset, 2237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 2241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safelength = 0; 2242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *safetext; 2243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safetextlength; 2244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar safebuffer[INITIAL_ARRAY_SIZE_]; 2245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t safeoffset = textoffset; 2246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2247b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (textoffset && 2248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_unsafeCP(strsrch->canonicalPrefixAccents[ 2249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_strlen(strsrch->canonicalPrefixAccents) - 1 2250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ], collator)) { 2251b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru safeoffset = getNextSafeOffset(collator, text, textoffset, 2252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength); 2253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safelength = safeoffset - textoffset; 2254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetextlength = INITIAL_ARRAY_SIZE_; 2255b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru safetext = addToUCharArray(safebuffer, &safetextlength, 2256b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->canonicalPrefixAccents, 2257b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru text + textoffset, safelength, 2258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru NULL, status); 2259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetextlength = u_strlen(strsrch->canonicalPrefixAccents); 2262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetext = strsrch->canonicalPrefixAccents; 2263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->utilIter; 2266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setText does nothing 2267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(coleiter, safetext, safetextlength, status); 2268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in loop below 2269b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2270f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *ce = strsrch->pattern.ces; 2271f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t celength = strsrch->pattern.cesLength; 2272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int ceindex = 0; 2273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool isSafe = TRUE; // safe zone indication flag for position 2274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); 2275b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (ceindex < celength) { 2277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textce = ucol_next(coleiter, status); 2278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 2280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce == UCOL_NULLORDER) { 2285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // check if we have passed the safe buffer 2286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (coleiter == strsrch->textIter) { 2287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru safetext = safebuffer; 2292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru coleiter = strsrch->textIter; 2293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, safeoffset); 2294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked at the start of the loop 2295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru isSafe = FALSE; 2296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textce = getCE(strsrch, textce); 2299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 2300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // do the beginning stuff 2301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t failedoffset = ucol_getOffset(coleiter); 2302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe && failedoffset <= prefixlength) { 2303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // alas... no hope. failed at rearranged accent set 2304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 2309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru failedoffset = safeoffset - failedoffset; 2310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2312b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // try rearranging the end accents 2314b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t result = doPreviousCanonicalSuffixMatch(strsrch, 2315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset, failedoffset, status); 2316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result != USEARCH_DONE) { 2317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setOffset does nothing 2318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, result); 2319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textce == ce[ceindex]) { 2327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ceindex ++; 2328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // set offset here 2331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isSafe) { 2332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = ucol_getOffset(coleiter); 2333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets the text iterator here with the correct expansion and offset 2334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t leftoverces = getExpansionSuffix(coleiter); 2335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cleanUpSafeText(strsrch, safetext, safebuffer); 2336b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (result <= prefixlength) { 2337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = textoffset; 2338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = textoffset + (safeoffset - result); 2341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, result); 2343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setExpansionSuffix(strsrch->textIter, leftoverces); 2344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2346b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2347b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return ucol_getOffset(coleiter); 2348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Trying out the substring and sees if it can be a canonical match. 2352b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* This will try normalizing the starting accents and arranging them into 2353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* canonical equivalents and check their corresponding ces with the pattern ce. 2354b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Prefix accents in the text will be grouped according to their combining 2355b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* class and the groups will be mixed and matched to try find the perfect 2356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* match with the pattern. 2357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2359b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* "\u0301\u0325". 2361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* step 2: check if any of the generated substrings matches the pattern. 2362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2365b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* @param textoffset start offset in the collation element text that starts 2366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* with the accents to be rearranged 2367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status output error status if any 2368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 2369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2371b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool doPreviousCanonicalMatch(UStringSearch *strsrch, 2372b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t textoffset, 2373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = textoffset; 2377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 2378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { 2379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 2380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = ucol_getOffset(coleiter); 2381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->pattern.hasSuffixAccents) { 2382b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset, 2383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset, status); 2384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 2385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, offset); 2386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!strsrch->pattern.hasPrefixAccents) { 2393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar accents[INITIAL_ARRAY_SIZE_]; 2397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // offset to the last base character in substring to search 2398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength); 2399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalizing the offensive string 2400b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD, 2401b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0, accents, INITIAL_ARRAY_SIZE_, status); 2402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in loop 2403b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t size = getUnblockedAccentIndex(accents, accentsindex); 2406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2 power n - 1 plus the full set of accents 2408b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t count = (2 << (size - 1)) - 1; 2409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (U_SUCCESS(*status) && count > 0) { 2410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *rearrange = strsrch->canonicalPrefixAccents; 2411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // copy the base characters 2412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int k = 0; k < accentsindex[0]; k ++) { 2413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[k]; 2414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forming all possible canonical rearrangement by dropping 2416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets of accents 2417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int i = 0; i <= size - 1; i ++) { 2418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t mask = 1 << (size - i - 1); 2419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count & mask) { 2420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange ++ = accents[j]; 2422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *rearrange = 0; 2426b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t offset = doPreviousCanonicalPrefixMatch(strsrch, 2427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru baseoffset, status); 2428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset != USEARCH_DONE) { 2429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; // match found 2430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 2432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2437b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks match for contraction. 2438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the match starts with a partial contraction we fail. 2439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param start offset of potential match, to be modified if necessary 2443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param end offset of potential match, to be modified if necessary 2444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status only error status if any 2445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if match passes the contraction test, FALSE otherwise 2446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2448b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, 2449b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *start, 2450b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *end, UErrorCode *status) 2451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 2453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 2454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t temp = *end; 2455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator = strsrch->collator; 2456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text = strsrch->search->text; 2457b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // This part checks if either if the start of the match contains potential 2458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // contraction. If so we'll have to iterate through them 2459b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Since we used ucol_next while previously looking for the potential 2460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // match, this guarantees that our end will not be a partial contraction, 2461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // or a partial supplementary character. 2462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expansion = getExpansionSuffix(coleiter); 2464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool expandflag = expansion > 0; 2465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, *end); 2466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (expansion > 0) { 2467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // getting rid of the redundant ce 2468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // since forward contraction/expansion may have extra ces 2469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we are in the normalization buffer, hasAccentsBeforeMatch 2470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // would have taken care of it. 2471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // E.g. the character \u01FA will have an expansion of 3, but if 2472b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // we are only looking for A ring A\u030A, we'll have to skip the 2473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // last ce in the expansion buffer 2474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_previous(coleiter, status); 2475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ucol_getOffset(coleiter) != temp) { 2479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *end = temp; 2480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 2481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expansion --; 2483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2485f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *patternce = strsrch->pattern.ces; 2486f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t patterncelength = strsrch->pattern.cesLength; 2487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = patterncelength; 2488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (count > 0) { 2489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below, note that if status is a failure 2491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ucol_previous returns UCOL_NULLORDER 2492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ce == UCOL_IGNORABLE) { 2493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2495b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (expandflag && count == 0 && 2496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru getColElemIterOffset(coleiter, FALSE) != temp) { 2497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *end = temp; 2498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru temp = ucol_getOffset(coleiter); 2499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2500b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (count == patterncelength && 2501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce != patternce[patterncelength - 1]) { 2502b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // accents may have extra starting ces, this occurs when a 2503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pure accent pattern is matched without rearrangement 2504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expected = patternce[patterncelength - 1]; 250583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_BACK_1(text, 0, *end); 2506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) { 2507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_previous(coleiter, status)); 2508b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru while (U_SUCCESS(*status) && ce != expected && 2509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce != UCOL_NULLORDER && 2510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_getOffset(coleiter) <= *start) { 2511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ce = getCE(strsrch, ucol_previous(coleiter, status)); 2512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*start) --; 2517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *start = getPreviousBaseOffset(text, *start); 2518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 2521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2522b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 2523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 2527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Checks and sets the match information if found. 2528b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* Checks 2529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <ul> 2530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> the potential match does not repeat the previous match 2531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> boundaries are correct 2532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> potential match does not end in the middle of a contraction 2533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <li> identical matches 2534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* <\ul> 2535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Otherwise the offset will be shifted to the next character. 2536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Internal method, status assumed to be success, caller has to check status 2537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* before calling this method. 2538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 2539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param textoffset offset in the collation element text. the returned value 2540b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* will be the truncated start offset of the match or the new start 2541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* search offset. 2542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status only error status if any 2543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if the match is valid, FALSE otherwise 2544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 2545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic 2546b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruinline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch, 2547b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *textoffset, 2548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to ensure that the start and ends are not composite characters 2551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 2552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we have a canonical accent match 2553b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((strsrch->pattern.hasSuffixAccents && 2554b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->canonicalSuffixAccents[0]) || 2555b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (strsrch->pattern.hasPrefixAccents && 2556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0])) { 2557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = *textoffset; 2558b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->matchedLength = 2559b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru getNextUStringSearchBaseOffset(strsrch, 2560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru getColElemIterOffset(coleiter, FALSE)) 2561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru - *textoffset; 2562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end = ucol_getOffset(coleiter); 2566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end, 2567b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status) || 2568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_FAILURE(*status)) { 2569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru end = getNextUStringSearchBaseOffset(strsrch, end); 2573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this totally matches, however we need to check if it is repeating 2574b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (checkRepeatedMatch(strsrch, *textoffset, end) || 2575b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru !isBreakUnit(strsrch, *textoffset, end) || 2576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru !checkIdentical(strsrch, *textoffset, end)) { 2577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (*textoffset) --; 2578b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *textoffset = getPreviousBaseOffset(strsrch->search->text, 2579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *textoffset); 2580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2582b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = *textoffset; 2584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = end - *textoffset; 2585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2587c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif // #if BOYER_MOORE 2588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// constructors and destructor ------------------------------------------- 2590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2591b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 2592b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t patternlength, 2593b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *text, 2594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength, 2595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *locale, 2596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiter, 2597b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 2598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if UCONFIG_NO_BREAK_ITERATION 2603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiter != NULL) { 2604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_UNSUPPORTED_ERROR; 2605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (locale) { 2609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ucol_open internally checks for status 2610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollator *collator = ucol_open(locale, status); 2611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pattern, text checks are done in usearch_openFromCollator 2612b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UStringSearch *result = usearch_openFromCollator(pattern, 2613b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru patternlength, text, textlength, 2614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru collator, breakiter, status); 2615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result == NULL || U_FAILURE(*status)) { 2617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (collator) { 2618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_close(collator); 2619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->ownCollator = TRUE; 2624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( 2632b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *pattern, 2633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternlength, 2634b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UChar *text, 2635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength, 2636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator, 2637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiter, 2638b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 2639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if UCONFIG_NO_BREAK_ITERATION 2644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiter != NULL) { 2645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_UNSUPPORTED_ERROR; 2646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pattern == NULL || text == NULL || collator == NULL) { 2650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // string search does not really work when numeric collation is turned on 2655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { 2656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_UNSUPPORTED_ERROR; 2657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 2661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru initializeFCD(status); 2662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UStringSearch *result; 2667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textlength == -1) { 2668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength = u_strlen(text); 2669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (patternlength == -1) { 2671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternlength = u_strlen(pattern); 2672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textlength <= 0 || patternlength <= 0) { 2674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2677b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); 2679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result == NULL) { 2680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_MEMORY_ALLOCATION_ERROR; 2681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->collator = collator; 2685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->strength = ucol_getStrength(collator); 2686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->ceMask = getMask(result->strength); 2687b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru result->toShift = 2688b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 2689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCOL_SHIFTED; 2690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->variableTop = ucol_getVariableTop(collator, status); 2691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2692f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius result->nfd = Normalizer2::getNFDInstance(*status); 269350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(result); 2696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search = (USearch *)uprv_malloc(sizeof(USearch)); 2700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result->search == NULL) { 2701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_MEMORY_ALLOCATION_ERROR; 2702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(result); 2703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->text = text; 2707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->textLength = textlength; 2708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->pattern.text = pattern; 2710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->pattern.textLength = patternlength; 2711f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius result->pattern.ces = NULL; 2712f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius result->pattern.pces = NULL; 2713b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->breakIter = breakiter; 2715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 2716b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status); 2717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiter) { 2718b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ubrk_setText(breakiter, text, textlength, status); 2719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->ownCollator = FALSE; 2723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->matchedLength = 0; 2724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->matchedIndex = USEARCH_DONE; 2725c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru result->utilIter = NULL; 2726b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru result->textIter = ucol_openElements(collator, text, 2727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength, status); 2728fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius result->textProcessedIter = NULL; 2729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_close(result); 2731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->isOverlap = FALSE; 2735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->isCanonicalMatch = FALSE; 273650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho result->search->elementComparisonType = 0; 2737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->isForwardSearching = TRUE; 2738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result->search->reset = TRUE; 2739b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru initialize(result, status); 2741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_close(result); 2744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) 2753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2755f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (strsrch->pattern.ces != strsrch->pattern.cesBuffer && 2756f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius strsrch->pattern.ces) { 2757f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius uprv_free(strsrch->pattern.ces); 2758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2759c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2760f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (strsrch->pattern.pces != NULL && 2761f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 2762f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius uprv_free(strsrch->pattern.pces); 2763c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2764c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2765fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete strsrch->textProcessedIter; 2766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_closeElements(strsrch->textIter); 2767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_closeElements(strsrch->utilIter); 2768c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->ownCollator && strsrch->collator) { 2770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_close((UCollator *)strsrch->collator); 2771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2772c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2773c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 2774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->internalBreakIter) { 2775b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ubrk_close(strsrch->search->internalBreakIter); 2776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2777c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 2778c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(strsrch->search); 2780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_free(strsrch); 2781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2784fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusnamespace { 2785fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 2786fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) { 2787fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (U_FAILURE(*status)) { return FALSE; } 2788fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (strsrch->textProcessedIter == NULL) { 2789fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter); 2790fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (strsrch->textProcessedIter == NULL) { 2791fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *status = U_MEMORY_ALLOCATION_ERROR; 2792fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 2793fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 2794fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 2795fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strsrch->textProcessedIter->init(strsrch->textIter); 2796fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 2797fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 2798fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 2799fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 2800fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 2801fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 2802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// set and get methods -------------------------------------------------- 2803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2804b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 2805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t position, 2806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 2809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isOutOfBounds(strsrch->search->textLength, position)) { 2810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_INDEX_OUTOFBOUNDS_ERROR; 2811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, position); 2814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = USEARCH_DONE; 2816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = 0; 2817b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->reset = FALSE; 2818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) 2822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t result = ucol_getOffset(strsrch->textIter); 2825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (isOutOfBounds(strsrch->search->textLength, result)) { 2826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 2829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2832b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2833b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 2834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearchAttribute attribute, 2835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearchAttributeValue value, 2836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 2839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (attribute) 2840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_OVERLAP : 2842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); 2843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_CANONICAL_MATCH : 2845b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE : 2846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru FALSE); 2847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 284850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case USEARCH_ELEMENT_COMPARISON : 284950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 285050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->search->elementComparisonType = (int16_t)value; 285150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 285250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->search->elementComparisonType = 0; 285350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 285450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 2855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_ATTRIBUTE_COUNT : 2856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 2857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { 2861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2864b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( 2866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UStringSearch *strsrch, 2867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearchAttribute attribute) 2868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (attribute) { 2871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_OVERLAP : 2872b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return (strsrch->search->isOverlap == TRUE ? USEARCH_ON : 2873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USEARCH_OFF); 2874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_CANONICAL_MATCH : 2875b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON : 2876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USEARCH_OFF); 287750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case USEARCH_ELEMENT_COMPARISON : 287850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { 287950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int16_t value = strsrch->search->elementComparisonType; 288050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 288150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (USearchAttributeValue)value; 288250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 288350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return USEARCH_STANDARD_ELEMENT_COMPARISON; 288450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 288550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 2886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case USEARCH_ATTRIBUTE_COUNT : 2887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DEFAULT; 2888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DEFAULT; 2891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( 2894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UStringSearch *strsrch) 2895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch == NULL) { 2897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->search->matchedIndex; 2900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2903b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 2904b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UChar *result, 2905b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t resultCapacity, 2906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 2909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2911b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 && 2912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result == NULL)) { 2913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t copylength = strsrch->search->matchedLength; 2918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t copyindex = strsrch->search->matchedIndex; 2919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (copyindex == USEARCH_DONE) { 2920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_terminateUChars(result, resultCapacity, 0, status); 2921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (resultCapacity < copylength) { 2925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru copylength = resultCapacity; 2926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (copylength > 0) { 2928b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uprv_memcpy(result, strsrch->search->text + copyindex, 2929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru copylength * sizeof(UChar)); 2930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2931b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return u_terminateUChars(result, resultCapacity, 2932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength, status); 2933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2934b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( 2936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UStringSearch *strsrch) 2937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->search->matchedLength; 2940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 2942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 2945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2946b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 2947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakiter, 2948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 2951b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho strsrch->search->breakIter = breakiter; 2952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakiter) { 2953b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ubrk_setText(breakiter, strsrch->search->text, 2954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength, status); 2955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2959b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI const UBreakIterator* U_EXPORT2 2960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruusearch_getBreakIterator(const UStringSearch *strsrch) 2961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 2963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->search->breakIter; 2964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 2966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2967b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2969b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 2970b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 2971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text, 2972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength, 2973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 2974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 2976b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strsrch == NULL || text == NULL || textlength < -1 || 2977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength == 0) { 2978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 2979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (textlength == -1) { 2982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength = u_strlen(text); 2983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->text = text; 2985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->textLength = textlength; 2986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setText(strsrch->textIter, text, textlength, status); 2987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = USEARCH_DONE; 2988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = 0; 2989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->reset = TRUE; 2990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 2991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->breakIter != NULL) { 2992b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ubrk_setText(strsrch->search->breakIter, text, 2993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textlength, status); 2994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status); 2996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 2997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3001b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 3002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *length) 3003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 3005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *length = strsrch->search->textLength; 3006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->search->text; 3007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 3009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3011b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 3012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator, 3013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (collator == NULL) { 3017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 3018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3020c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 3022fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete strsrch->textProcessedIter; 3023fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strsrch->textProcessedIter = NULL; 3024fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ucol_closeElements(strsrch->textIter); 3025fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ucol_closeElements(strsrch->utilIter); 3026fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strsrch->textIter = strsrch->utilIter = NULL; 3027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->ownCollator && (strsrch->collator != collator)) { 3028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_close((UCollator *)strsrch->collator); 3029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->ownCollator = FALSE; 3030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->collator = collator; 3032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->strength = ucol_getStrength(collator); 3033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->ceMask = getMask(strsrch->strength); 3034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 3035b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ubrk_close(strsrch->search->internalBreakIter); 3036b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status), 3037b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho strsrch->search->text, strsrch->search->textLength, status); 3038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 3039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3040b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->toShift = 3041b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 3042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCOL_SHIFTED; 3043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_getVariableTop returns 0 3044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->variableTop = ucol_getVariableTop(collator, status); 3045fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strsrch->textIter = ucol_openElements(collator, 3046fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strsrch->search->text, 3047fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strsrch->search->textLength, 3048fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius status); 3049fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strsrch->utilIter = ucol_openElements( 3050fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius collator, strsrch->pattern.text, strsrch->pattern.textLength, status); 3051fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // initialize() _after_ setting the iterators for the new collator. 3052fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius initialize(strsrch, status); 3053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3054c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3055c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // **** are these calls needed? 3056c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // **** we call uprv_init_pce in initializePatternPCETable 3057f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // **** and the CEIBuffer constructor... 3058c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if 0 3059c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_init_pce(strsrch->textIter); 3060c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_init_pce(strsrch->utilIter); 3061c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) 3066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 3068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return (UCollator *)strsrch->collator; 3069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 3071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3073b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 3074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *pattern, 3075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternlength, 3076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch == NULL || pattern == NULL) { 3080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 3081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (patternlength == -1) { 3084b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternlength = u_strlen(pattern); 3085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (patternlength == 0) { 3087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 3088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.text = pattern; 3091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.textLength = patternlength; 3092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru initialize(strsrch, status); 3093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3094b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3095b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3097b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI const UChar* U_EXPORT2 3098b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruusearch_getPattern(const UStringSearch *strsrch, 3099b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *length) 3100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 3102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *length = strsrch->pattern.textLength; 3103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return strsrch->pattern.text; 3104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 3106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// miscellanous methods -------------------------------------------------- 3109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3110b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 3111b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 3112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch && U_SUCCESS(*status)) { 3114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = TRUE; 3115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_setOffset(strsrch, 0, status); 3116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return usearch_next(strsrch, status); 3118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3123b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 3124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t position, 3125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch && U_SUCCESS(*status)) { 3128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = TRUE; 3129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // position checked in usearch_setOffset 3130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_setOffset(strsrch, position, status); 3131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3132b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return usearch_next(strsrch, status); 3133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3137b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3138b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 3139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch && U_SUCCESS(*status)) { 3142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = FALSE; 3143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_setOffset(strsrch, strsrch->search->textLength, status); 3144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return usearch_previous(strsrch, status); 3146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3151b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 3152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t position, 3153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch && U_SUCCESS(*status)) { 3156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = FALSE; 3157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // position checked in usearch_setOffset 3158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_setOffset(strsrch, position, status); 3159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3160b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return usearch_previous(strsrch, status); 3161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3165b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 3167b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* If a direction switch is required, we'll count the number of ces till the 3168b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* beginning of the collation element iterator and iterate forwards that 3169b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* number of times. This is so that we get to the correct point within the 3170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* string to continue the search in. Imagine when we are in the middle of the 3171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* normalization buffer when the change in direction is request. arrrgghh.... 3172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* After searching the offset within the collation element iterator will be 3173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shifted to the start of the match. If a match is not found, the offset would 3174b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* have been set to the end of the text string in the collation element 3175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* iterator. 3176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Okay, here's my take on normalization buffer. The only time when there can 3177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* be 2 matches within the same normalization is when the pattern is consists 3178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* of all accents. But since the offset returned is from the text string, we 3179b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* should not confuse the caller by returning the second match within the 3180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* same normalization buffer. If we do, the 2 results will have the same match 3181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* offsets, and that'll be confusing. I'll return the next match that doesn't 3182b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* fall within the same normalization buffer. Note this does not affect the 3183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* results of matches spanning the text and the normalization buffer. 3184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* The position to start searching is taken from the collation element 3185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* iterator. Callers of this API would have to set the offset in the collation 3186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* element iterator before using this method. 3187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 3188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 3189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3190b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{ 3191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 3192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // note offset is either equivalent to the start of the previous match 3193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // or is set by the user 3194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = usearch_getOffset(strsrch); 3195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearch *search = strsrch->search; 3196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->reset = FALSE; 3197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = search->textLength; 3198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->isForwardSearching) { 3199c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 3200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset == textlength 3201b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru || (!search->isOverlap && 3202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (offset + strsrch->pattern.defaultShiftSize > textlength || 3203b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (search->matchedIndex != USEARCH_DONE && 3204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset + search->matchedLength >= textlength)))) { 3205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // not enough characters to match 3206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 3207b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return USEARCH_DONE; 3208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3209c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 3210c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (offset == textlength || 3211c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (! search->isOverlap && 3212c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (search->matchedIndex != USEARCH_DONE && 3213c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru offset + search->matchedLength > textlength))) { 3214c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // not enough characters to match 3215c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 3216c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return USEARCH_DONE; 3217c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3218c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3221b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // switching direction. 3222b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // if matchedIndex == USEARCH_DONE, it means that either a 3223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // setOffset has been called or that previous ran off the text 3224b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // string. the iterator would have been set to offset 0 if a 3225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // match is not found. 3226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->isForwardSearching = TRUE; 3227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedIndex != USEARCH_DONE) { 3228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // there's no need to set the collation element iterator 3229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the next call to next will set the offset. 3230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return search->matchedIndex; 3231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3235f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (strsrch->pattern.cesLength == 0) { 3236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedIndex == USEARCH_DONE) { 3237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedIndex = offset; 3238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { // moves by codepoints 324083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_FWD_1(search->text, search->matchedIndex, textlength); 3241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3242b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedLength = 0; 3244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, search->matchedIndex); 3245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedIndex == textlength) { 3247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedIndex = USEARCH_DONE; 3248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedLength > 0) { 3252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if matchlength is 0 we are at the start of the iteration 3253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->isOverlap) { 3254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, offset + 1, status); 3255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3257b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucol_setOffset(strsrch->textIter, 3258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset + search->matchedLength, status); 3259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for boundary check purposes. this will ensure that the 3263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // next match will not preceed the current offset 3264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // note search->matchedIndex will always be set to something 3265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // in the code 3266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedIndex = offset - 1; 3267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->isCanonicalMatch) { 3270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // can't use exact here since extra accents are allowed. 3271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_handleNextCanonical(strsrch, status); 3272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_handleNextExact(strsrch, status); 3275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 3279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3282c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if !BOYER_MOORE 3283c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (search->matchedIndex == USEARCH_DONE) { 3284c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, search->textLength, status); 3285c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 3286c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, search->matchedIndex, status); 3287c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3288c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3289c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return search->matchedIndex; 3291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 3297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 3298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status) && strsrch) { 3300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset; 3301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru USearch *search = strsrch->search; 3302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->reset) { 3303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset = search->textLength; 3304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->isForwardSearching = FALSE; 3305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->reset = FALSE; 3306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, offset); 3307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset = usearch_getOffset(strsrch); 3310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3311b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t matchedindex = search->matchedIndex; 3313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->isForwardSearching == TRUE) { 3314b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // switching direction. 3315b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // if matchedIndex == USEARCH_DONE, it means that either a 3316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // setOffset has been called or that next ran off the text 3317b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // string. the iterator would have been set to offset textLength if 3318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // a match is not found. 3319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->isForwardSearching = FALSE; 3320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (matchedindex != USEARCH_DONE) { 3321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return matchedindex; 3322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3325c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 3326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset == 0 || matchedindex == 0 || 3327b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (!search->isOverlap && 3328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (offset < strsrch->pattern.defaultShiftSize || 3329b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (matchedindex != USEARCH_DONE && 3330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru matchedindex < strsrch->pattern.defaultShiftSize)))) { 3331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // not enough characters to match 3332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 3333b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return USEARCH_DONE; 3334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3335c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 3336c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Could check pattern length, but the 3337c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // linear search will do the right thing 3338c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (offset == 0 || matchedindex == 0) { 3339c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 3340c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return USEARCH_DONE; 3341c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3342c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*status)) { 3346f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (strsrch->pattern.cesLength == 0) { 3347b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru search->matchedIndex = 3348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (matchedindex == USEARCH_DONE ? offset : matchedindex); 3349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (search->matchedIndex == 0) { 3350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 3351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { // move by codepoints 335483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius U16_BACK_1(search->text, 0, search->matchedIndex); 3355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(strsrch->textIter, search->matchedIndex); 3356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru search->matchedLength = 0; 3358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->isCanonicalMatch) { 3362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // can't use exact here since extra accents are allowed. 3363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_handlePreviousCanonical(strsrch, status); 3364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usearch_handlePreviousExact(strsrch, status); 3368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked below 3369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 3373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3375b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return search->matchedIndex; 3377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return USEARCH_DONE; 3380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3383b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) 3385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3386b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru /* 3387b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru reset is setting the attributes that are already in 3388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru string search, hence all attributes in the collator should 3389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru be retrieved without any problems 3390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 3391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch) { 3392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool sameCollAttribute = TRUE; 3394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t ceMask; 3395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool shift; 3396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t varTop; 3397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3398b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // **** hack to deal w/ how processed CEs encode quaternary **** 3399b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UCollationStrength newStrength = ucol_getStrength(strsrch->collator); 3400b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || 3401b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { 3402b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru sameCollAttribute = FALSE; 3403b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 3404b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->strength = ucol_getStrength(strsrch->collator); 3406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ceMask = getMask(strsrch->strength); 3407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->ceMask != ceMask) { 3408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->ceMask = ceMask; 3409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sameCollAttribute = FALSE; 3410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3411b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3413b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, 3414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &status) == UCOL_SHIFTED; 3415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->toShift != shift) { 3416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->toShift = shift; 3417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sameCollAttribute = FALSE; 3418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_getVariableTop returns 0 3421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru varTop = ucol_getVariableTop(strsrch->collator, &status); 3422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->variableTop != varTop) { 3423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->variableTop = varTop; 3424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sameCollAttribute = FALSE; 3425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!sameCollAttribute) { 3427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru initialize(strsrch, &status); 3428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ucol_setText(strsrch->textIter, strsrch->search->text, 3430b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru strsrch->search->textLength, 343150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho &status); 3432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedLength = 0; 3433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->matchedIndex = USEARCH_DONE; 3434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isOverlap = FALSE; 3435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isCanonicalMatch = FALSE; 343650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho strsrch->search->elementComparisonType = 0; 3437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->isForwardSearching = TRUE; 3438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->search->reset = TRUE; 3439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3442c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3443c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// CEI Collation Element + source text index. 3444c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// These structs are kept in the circular buffer. 3445c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3446c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustruct CEI { 3447c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t ce; 3448c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t lowIndex; 3449c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t highIndex; 3450c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}; 3451c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3452c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_BEGIN 3453c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3454fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusnamespace { 3455c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3456f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// CEIBuffer A circular buffer of CEs-with-index from the text being searched. 3457c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3458b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define DEFAULT_CEBUFFER_SIZE 96 3459b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define CEBUFFER_EXTRA 32 3460b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Some typical max values to make buffer size more reasonable for asymmetric search. 3461b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// #8694 is for a better long-term solution to allocation of this buffer. 3462b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8 3463b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3 3464b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186)) 3465f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusstruct CEIBuffer { 3466c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru CEI defBuf[DEFAULT_CEBUFFER_SIZE]; 3467c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru CEI *buf; 3468c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t bufSize; 3469c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t firstIx; 3470c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t limitIx; 3471c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UCollationElements *ceIter; 3472c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UStringSearch *strSearch; 3473c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3474c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3475c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3476f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius CEIBuffer(UStringSearch *ss, UErrorCode *status); 3477f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ~CEIBuffer(); 3478c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru const CEI *get(int32_t index); 3479c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru const CEI *getPrevious(int32_t index); 3480c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}; 3481c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3482c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3483f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusCEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) { 3484c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru buf = defBuf; 3485c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strSearch = ss; 3486f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA; 3487b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (ss->search->elementComparisonType != 0) { 3488b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const UChar * patText = ss->pattern.text; 3489b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (patText) { 3490b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const UChar * patTextLimit = patText + ss->pattern.textLength; 3491b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while ( patText < patTextLimit ) { 3492b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar c = *patText++; 3493b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (MIGHT_BE_JAMO_L(c)) { 3494b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 3495b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 3496b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // No check for surrogates, we might allocate slightly more buffer than necessary. 3497b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 3498b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3499b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3500b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3501b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3502c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ceIter = ss->textIter; 3503c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru firstIx = 0; 3504c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru limitIx = 0; 3505c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3506fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (!initTextProcessedIter(ss, status)) { return; } 3507c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3508c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (bufSize>DEFAULT_CEBUFFER_SIZE) { 3509c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); 3510c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (buf == NULL) { 3511c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *status = U_MEMORY_ALLOCATION_ERROR; 3512c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3513c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3514c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3515c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3516c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// TODO: add a reset or init function so that allocated 3517c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// buffers can be retained & reused. 3518c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3519f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusCEIBuffer::~CEIBuffer() { 3520c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (buf != defBuf) { 3521c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uprv_free(buf); 3522c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3523c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3524c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3525c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3526c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Get the CE with the specified index. 3527c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Index must be in the range 3528c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// n-history_size < index < n+1 3529c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// where n is the largest index to have been fetched by some previous call to this function. 3530c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3531c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3532f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusconst CEI *CEIBuffer::get(int32_t index) { 3533c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int i = index % bufSize; 3534c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3535c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index>=firstIx && index<limitIx) { 3536c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The request was for an entry already in our buffer. 3537c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Just return it. 3538c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return &buf[i]; 3539c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3540c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3541c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Caller is requesting a new, never accessed before, CE. 3542c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Verify that it is the next one in sequence, which is all 3543c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // that is allowed. 3544c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index != limitIx) { 3545c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(FALSE); 3546c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3547c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 3548c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3549c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3550c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Manage the circular CE buffer indexing 3551c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru limitIx++; 3552c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3553c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (limitIx - firstIx >= bufSize) { 3554c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The buffer is full, knock out the lowest-indexed entry. 3555c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru firstIx++; 3556c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3557c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3558c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3559c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3560fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3561c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3562c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return &buf[i]; 3563c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3564c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3565c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Get the CE with the specified index. 3566c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Index must be in the range 3567c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// n-history_size < index < n+1 3568c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// where n is the largest index to have been fetched by some previous call to this function. 3569c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3570c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3571f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusconst CEI *CEIBuffer::getPrevious(int32_t index) { 3572c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int i = index % bufSize; 3573c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3574c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index>=firstIx && index<limitIx) { 3575c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The request was for an entry already in our buffer. 3576c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Just return it. 3577c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return &buf[i]; 3578c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3579c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3580c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Caller is requesting a new, never accessed before, CE. 3581c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Verify that it is the next one in sequence, which is all 3582c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // that is allowed. 3583c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index != limitIx) { 3584c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(FALSE); 3585c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3586c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 3587c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3588c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3589c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Manage the circular CE buffer indexing 3590c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru limitIx++; 3591c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3592c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (limitIx - firstIx >= bufSize) { 3593c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The buffer is full, knock out the lowest-indexed entry. 3594c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru firstIx++; 3595c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3596c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3597c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3598c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3599fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3600c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3601c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return &buf[i]; 3602c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3603c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3604fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 3605fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 3606c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_END 3607c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3608c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3609c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// #define USEARCH_DEBUG 3610c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3611c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#ifdef USEARCH_DEBUG 3612c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include <stdio.h> 3613c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include <stdlib.h> 3614c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3615c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3616c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/* 3617c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Find the next break boundary after startIndex. If the UStringSearch object 3618c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * has an external break iterator, use that. Otherwise use the internal character 3619c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * break iterator. 3620c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 3621c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) { 3622c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if 0 3623c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru const UChar *text = strsrch->search->text; 3624c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t textLen = strsrch->search->textLength; 3625b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3626c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(startIndex>=0); 3627c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(startIndex<=textLen); 3628b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3629c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (startIndex >= textLen) { 3630c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return startIndex; 3631c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3632c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3633c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c; 3634c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t i = startIndex; 3635c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U16_NEXT(text, i, textLen, c); 3636b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3637c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If we are on a control character, stop without looking for combining marks. 3638c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Control characters do not combine. 3639c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3640c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 3641c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return i; 3642c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3643b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3644c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The initial character was not a control, and can thus accept trailing 3645c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // combining characters. Advance over however many of them there are. 3646c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t indexOfLastCharChecked; 3647c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (;;) { 3648c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru indexOfLastCharChecked = i; 3649c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (i>=textLen) { 3650c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 3651c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3652c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U16_NEXT(text, i, textLen, c); 3653c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3654c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3655c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 3656c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3657c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3658c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return indexOfLastCharChecked; 3659c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#elif !UCONFIG_NO_BREAK_ITERATION 3660c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->breakIter; 3661c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3662c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakiterator == NULL) { 3663c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru breakiterator = strsrch->search->internalBreakIter; 3664c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3665c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3666c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakiterator != NULL) { 3667b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return ubrk_following(breakiterator, startIndex); 3668c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3669c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3670c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return startIndex; 3671c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 3672c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // **** or should we use the original code? **** 3673c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return startIndex; 3674c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3675c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3676c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3677c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3678c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/* 3679c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Returns TRUE if index is on a break boundary. If the UStringSearch 3680c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * has an external break iterator, test using that, otherwise test 3681c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * using the internal character break iterator. 3682c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 3683c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) { 3684c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if 0 3685c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru const UChar *text = strsrch->search->text; 3686c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t textLen = strsrch->search->textLength; 3687b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3688c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(index>=0); 3689c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(index<=textLen); 3690b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3691c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index>=textLen || index<=0) { 3692b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return TRUE; 3693c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3694b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3695c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If the character at the current index is not a GRAPHEME_EXTEND 3696c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // then we can not be within a combining sequence. 3697c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c; 3698c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U16_GET(text, 0, index, textLen, c); 3699c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3700c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3701b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return TRUE; 3702c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3703b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3704c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We are at a combining mark. If the preceding character is anything 3705c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // except a CONTROL, CR or LF, we are in a combining sequence. 3706b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru U16_PREV(text, 0, index, c); 3707c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3708b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 3709b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return !combining; 3710c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#elif !UCONFIG_NO_BREAK_ITERATION 3711c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->breakIter; 3712c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3713c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakiterator == NULL) { 3714c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru breakiterator = strsrch->search->internalBreakIter; 3715c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3716c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3717b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index)); 3718c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 3719c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // **** or use the original code? **** 3720b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return TRUE; 3721c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3722b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 3723c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3724c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if 0 3725c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end) 3726c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 3727c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 3728c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBreakIterator *breakiterator = strsrch->search->breakIter; 3729c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3730c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakiterator != NULL) { 3731c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t startindex = ubrk_first(breakiterator); 3732c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t endindex = ubrk_last(breakiterator); 3733b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3734c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // out-of-range indexes are never boundary positions 3735c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (start < startindex || start > endindex || 3736c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru end < startindex || end > endindex) { 3737c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 3738c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3739c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3740b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return ubrk_isBoundary(breakiterator, start) && 3741c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ubrk_isBoundary(breakiterator, end); 3742c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3743c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3744c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3745c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 3746c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3747c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3748c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 374950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehotypedef enum { 375050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_MATCH = -1, 375150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_NO_MATCH = 0, 375250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_SKIP_TARG, 375350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_SKIP_PATN 375450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} UCompareCEsResult; 375550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#define U_CE_LEVEL2_BASE 0x00000005 375650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#define U_CE_LEVEL3_BASE 0x00050000 375750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 375850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { 375950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (targCE == patCE) { 376050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_MATCH; 376150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 376250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (compareType == 0) { 376350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_NO_MATCH; 376450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 376550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 376650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t targCEshifted = targCE >> 32; 376750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t patCEshifted = patCE >> 32; 376850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t mask; 376950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 377050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho mask = 0xFFFF0000; 377150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targLev1 = (int32_t)(targCEshifted & mask); 377250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t patLev1 = (int32_t)(patCEshifted & mask); 377350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev1 != patLev1 ) { 377450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev1 == 0 ) { 377550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_SKIP_TARG; 377650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 377750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 377850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_SKIP_PATN; 377950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 378050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_NO_MATCH; 378150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 378250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 378350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho mask = 0x0000FFFF; 378450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targLev2 = (int32_t)(targCEshifted & mask); 378550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t patLev2 = (int32_t)(patCEshifted & mask); 378650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev2 != patLev2 ) { 378750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev2 == 0 ) { 378850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_SKIP_TARG; 378950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 379050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 379150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_SKIP_PATN; 379250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 379350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )? 379450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_MATCH: U_CE_NO_MATCH; 379550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 379650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 379750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho mask = 0xFFFF0000; 379850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targLev3 = (int32_t)(targCE & mask); 379950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t patLev3 = (int32_t)(patCE & mask); 380050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( targLev3 != patLev3 ) { 380150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )? 380250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CE_MATCH: U_CE_NO_MATCH; 380350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 380450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 380550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_CE_MATCH; 380650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 380750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 380850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if BOYER_MOORE 380950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s 381050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 3811b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3812c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertnamespace { 3813c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 3814c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertUChar32 codePointAt(const USearch &search, int32_t index) { 3815c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (index < search.textLength) { 3816c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert UChar32 c; 3817c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert U16_NEXT(search.text, index, search.textLength, c); 3818c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return c; 3819c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 3820c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return U_SENTINEL; 3821c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 3822c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 3823c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertUChar32 codePointBefore(const USearch &search, int32_t index) { 3824c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (0 < index) { 3825c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert UChar32 c; 3826c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert U16_PREV(search.text, 0, index, c); 3827c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return c; 3828c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 3829c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return U_SENTINEL; 3830c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 3831c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 3832c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} // namespace 3833c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 3834c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 3835c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t startIdx, 3836c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t *matchStart, 3837c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t *matchLimit, 3838b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 3839c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 3840c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*status)) { 3841c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 3842c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3843c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3844c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: reject search patterns beginning with a combining char. 3845c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3846c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#ifdef USEARCH_DEBUG 3847c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 3848c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("Pattern CEs\n"); 3849f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 3850f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius printf(" %8x", strsrch->pattern.ces[ii]); 3851c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3852c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("\n"); 3853c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3854b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3855c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 3856c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Input parameter sanity check. 3857c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: should input indicies clip to the text length 3858c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // in the same way that UText does. 3859f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(strsrch->pattern.cesLength == 0 || 3860c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru startIdx < 0 || 3861c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru startIdx > strsrch->search->textLength || 3862f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius strsrch->pattern.ces == NULL) { 3863c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 3864c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 3865c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3866c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3867f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (strsrch->pattern.pces == NULL) { 3868c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru initializePatternPCETable(strsrch, status); 3869c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3870c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3871c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, startIdx, status); 3872f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius CEIBuffer ceb(strsrch, status); 3873c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3874b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3875b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t targetIx = 0; 387650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const CEI *targetCEI = NULL; 3877c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t patIx; 3878c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool found; 3879c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3880c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t mStart = -1; 3881c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t mLimit = -1; 3882c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t minLimit; 3883c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t maxLimit; 3884b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3885b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3886b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 3887c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Outer loop moves over match starting positions in the 3888c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // target CE space. 388950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Here we see the target as a sequence of collation elements, resulting from the following: 389050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 389150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // (for example, digraphs such as IJ may be broken into two characters). 389250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 389350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 389450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 389550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 389650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // then the CE is deleted, so the following code sees only CEs that are relevant. 389750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 389850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 389950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 390050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 3901c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for(targetIx=0; ; targetIx++) 3902c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 3903c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = TRUE; 3904c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Inner loop checks for a match beginning at each 3905c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // position from the outer loop. 390650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targetIxOffset = 0; 390750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t patCE = 0; 3908b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 3909b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (compared to the last CE fetched for the previous targetIx value) as we need to go 3910b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 3911b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const CEI *firstCEI = ceb.get(targetIx); 3912b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (firstCEI == NULL) { 3913b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *status = U_INTERNAL_PROGRAM_ERROR; 3914b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 3915b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 3916b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3917b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 3918f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) { 3919f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius patCE = strsrch->pattern.pces[patIx]; 392050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetCEI = ceb.get(targetIx+patIx+targetIxOffset); 3921c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Compare CE from target string with CE from the pattern. 392250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 3923c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // which will fail the compare, below. 392450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 392550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( ceMatch == U_CE_NO_MATCH ) { 3926c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 3927c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 392850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if ( ceMatch > U_CE_NO_MATCH ) { 392950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( ceMatch == U_CE_SKIP_TARG ) { 393050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // redo with same patCE, next targCE 393150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho patIx--; 393250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetIxOffset++; 393350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { // ceMatch == U_CE_SKIP_PATN 393450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // redo with same targCE, next patCE 393550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetIxOffset--; 393650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 3937c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3938c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3939f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far 3940c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 394150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 3942c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No match at this targetIx. Try again at the next. 3943c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 3944c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3945c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3946c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (!found) { 3947c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No match at all, we have run off the end of the target text. 3948c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 3949c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3950c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3951c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3952c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have found a match in CE space. 3953c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Now determine the bounds in string index space. 3954c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // There still is a chance of match failure if the CE range not correspond to 3955c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // an acceptable character range. 3956c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 395750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1); 3958c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3959c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mStart = firstCEI->lowIndex; 3960c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru minLimit = lastCEI->lowIndex; 3961c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3962c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Look at the CE following the match. If it is UCOL_NULLORDER the match 3963c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // extended to the end of input, and the match is good. 3964c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3965c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Look at the high and low indices of the CE following the match. If 3966c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // they are the same it means one of two things: 3967c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 1. The match extended to the last CE from the target text, which is OK, or 3968c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2. The last CE that was part of the match is in an expansion that extends 3969c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // to the first CE after the match. In this case, we reject the match. 3970b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const CEI *nextCEI = 0; 397150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (strsrch->search->elementComparisonType == 0) { 3972b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho nextCEI = ceb.get(targetIx + targetIxOffset); 397350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho maxLimit = nextCEI->lowIndex; 397450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 397550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho found = FALSE; 397650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 397750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 397850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for ( ; ; ++targetIxOffset ) { 397950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho nextCEI = ceb.get(targetIx + targetIxOffset); 398050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho maxLimit = nextCEI->lowIndex; 3981b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we are at the end of the target too, match succeeds 398250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) { 398350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 398450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 398550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // As long as the next CE has primary weight of 0, 398650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // it is part of the last target element matched by the pattern; 398750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // make sure it can be part of a match with the last patCE 398850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) { 3989b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType); 3990b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) { 3991b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 3992b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 3993b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 399450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 399550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // target element, but it has non-zero primary weight => match fails 399650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if ( nextCEI->lowIndex == nextCEI->highIndex ) { 3997b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = false; 3998b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 399950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Else the target CE is not part of an expansion of the last matched element, match succeeds 400050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 4001b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 400250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 400350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 4004c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4005b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4006c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4007c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Check for the start of the match being within a combining sequence. 4008c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This can happen if the pattern itself begins with a combining char, and 4009c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the match found combining marks in the target text that were attached 4010c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // to something else. 4011c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This type of match should be rejected for not completely consuming a 4012c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // combining sequence. 4013b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (!isBreakBoundary(strsrch, mStart)) { 4014c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4015c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4016c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4017c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Check for the start of the match being within an Collation Element Expansion, 4018c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // meaning that the first char of the match is only partially matched. 4019b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // With exapnsions, the first CE will report the index of the source 4020c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // character, and all subsequent (expansions) CEs will report the source index of the 4021b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // _following_ character. 4022c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t secondIx = firstCEI->highIndex; 4023c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mStart == secondIx) { 4024c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4025c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4026b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4027c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // Allow matches to end in the middle of a grapheme cluster if the following 4028c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // conditions are met; this is needed to make prefix search work properly in 4029c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // Indic, see #11750 4030c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * the default breakIter is being used 4031c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * the next collation element after this combining sequence 4032c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // - has non-zero primary weight 4033c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // - corresponds to a separate character following the one at end of the current match 4034c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // (the second of these conditions, and perhaps both, may be redundant given the 4035c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // subsequent check for normalization boundary; however they are likely much faster 4036c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // tests in any case) 4037c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * the match limit is a normalization boundary 4038c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert UBool allowMidclusterMatch = FALSE; 4039c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { 4040c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert allowMidclusterMatch = 4041c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert strsrch->search->breakIter == NULL && 4042c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && 4043c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && 4044c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || 4045c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); 4046c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 4047c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // If those conditions are met, then: 4048c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 4049c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // the match limit may be backed off to a previous break boundary. This handles 4050c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // cases in which mLimit includes target characters that are ignorable with current 4051c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // settings (such as space) and which extend beyond the pattern match. 4052c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * do NOT require that end of the combining sequence not extend beyond the match in CE space 4053c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * do NOT require that match limit be on a breakIter boundary 4054c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 4055c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Advance the match end position to the first acceptable match boundary. 4056c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This advances the index over any combining charcters. 4057c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mLimit = maxLimit; 4058c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (minLimit < maxLimit) { 4059b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // When the last CE's low index is same with its high index, the CE is likely 4060b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // a part of expansion. In this case, the index is located just after the 4061b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // character corresponding to the CEs compared above. If the index is right 4062b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // at the break boundary, move the position to the next boundary will result 4063b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // incorrect match length when there are ignorable characters exist between 4064b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the position and the next character produces CE(s). See ticket#8482. 4065b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) { 4066b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = minLimit; 4067b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4068b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4069c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // Note that we can have nba < maxLimit && nba >= minLImit, in which 4070c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // case we want to set mLimit to nba regardless of allowMidclusterMatch 4071c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // (i.e. we back off mLimit to the previous breakIterator boundary). 4072c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { 4073b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = nba; 4074b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4075c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4076c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4077b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4078c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #ifdef USEARCH_DEBUG 4079c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4080c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4081c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4082c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #endif 4083b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4084c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (!allowMidclusterMatch) { 4085c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // If advancing to the end of a combining sequence in character indexing space 4086c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // advanced us beyond the end of the match in CE space, reject this match. 4087c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (mLimit > maxLimit) { 4088c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert found = FALSE; 4089c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 4090c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4091c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (!isBreakBoundary(strsrch, mLimit)) { 4092c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert found = FALSE; 4093c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 4094c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4095c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4096b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (! checkIdentical(strsrch, mStart, mLimit)) { 4097b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = FALSE; 4098b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 4099b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4100c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (found) { 4101c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4102c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4103c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4104c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4105c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #ifdef USEARCH_DEBUG 4106c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4107c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4108c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t lastToPrint = ceb.limitIx+2; 4109c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4110c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4111c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4112c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("\n%s\n", found? "match found" : "no match"); 4113c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4114c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #endif 4115c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4116c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // All Done. Store back the match bounds to the caller. 4117c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4118c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (found==FALSE) { 4119c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mLimit = -1; 4120c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mStart = -1; 4121c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4122c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4123c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (matchStart != NULL) { 4124c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *matchStart= mStart; 4125c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4126c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4127c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (matchLimit != NULL) { 4128c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *matchLimit = mLimit; 4129c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4130c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4131c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return found; 4132c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4133c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 4135c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t startIdx, 4136c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t *matchStart, 4137c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t *matchLimit, 4138b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode *status) 4139c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru{ 4140c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*status)) { 4141c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4142c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4143c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4144c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: reject search patterns beginning with a combining char. 4145c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4146c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#ifdef USEARCH_DEBUG 4147c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4148c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("Pattern CEs\n"); 4149f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 4150f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius printf(" %8x", strsrch->pattern.ces[ii]); 4151c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4152c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("\n"); 4153c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4154b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4155c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4156c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Input parameter sanity check. 4157c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: should input indicies clip to the text length 4158c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // in the same way that UText does. 4159f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(strsrch->pattern.cesLength == 0 || 4160c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru startIdx < 0 || 4161c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru startIdx > strsrch->search->textLength || 4162f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius strsrch->pattern.ces == NULL) { 4163c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *status = U_ILLEGAL_ARGUMENT_ERROR; 4164c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4165c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4166c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4167f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (strsrch->pattern.pces == NULL) { 4168c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru initializePatternPCETable(strsrch, status); 4169c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4170c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4171f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius CEIBuffer ceb(strsrch, status); 4172b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t targetIx = 0; 4173c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4174c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru /* 4175c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Pre-load the buffer with the CE's for the grapheme 4176c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * after our starting position so that we're sure that 4177c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * we can look at the CE following the match when we 4178c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * check the match boundaries. 4179c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * 4180c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * This will also pre-fetch the first CE that we'll 4181c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * consider for the match. 4182c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 4183c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (startIdx < strsrch->search->textLength) { 4184c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBreakIterator *bi = strsrch->search->internalBreakIter; 4185c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t next = ubrk_following(bi, startIdx); 4186c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4187c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, next, status); 4188c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4189c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (targetIx = 0; ; targetIx += 1) { 4190c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { 4191c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4192c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4193c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4194c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4195c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ucol_setOffset(strsrch->textIter, startIdx, status); 4196c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4197b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4198c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 419950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const CEI *targetCEI = NULL; 4200c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t patIx; 4201c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool found; 4202c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4203c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t limitIx = targetIx; 4204c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t mStart = -1; 4205c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t mLimit = -1; 4206c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t minLimit; 4207c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t maxLimit; 4208b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4209b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4210b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4211c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Outer loop moves over match starting positions in the 4212c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // target CE space. 421350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 421450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // But patIx is 0 at the beginning of the pattern and increases toward the end. 421550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 421650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // and the beginning of the base text. 4217c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for(targetIx = limitIx; ; targetIx += 1) 4218c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 4219c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = TRUE; 4220b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 4221b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (compared to the last CE fetched for the previous targetIx value) as we need to go 4222b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 4223b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const CEI *lastCEI = ceb.getPrevious(targetIx); 4224b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (lastCEI == NULL) { 4225b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *status = U_INTERNAL_PROGRAM_ERROR; 4226b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 4227b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 4228b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4229c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Inner loop checks for a match beginning at each 4230c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // position from the outer loop. 423150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t targetIxOffset = 0; 4232f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) { 4233f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int64_t patCE = strsrch->pattern.pces[patIx]; 4234c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4235f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset); 4236c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Compare CE from target string with CE from the pattern. 4237c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 4238c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // which will fail the compare, below. 423950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 424050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( ceMatch == U_CE_NO_MATCH ) { 4241c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4242c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 424350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if ( ceMatch > U_CE_NO_MATCH ) { 424450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if ( ceMatch == U_CE_SKIP_TARG ) { 424550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // redo with same patCE, next targCE 424650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho patIx++; 424750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetIxOffset++; 424850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { // ceMatch == U_CE_SKIP_PATN 424950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // redo with same targCE, next patCE 425050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho targetIxOffset--; 425150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 4252c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4253c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4254c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 425550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 4256c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No match at this targetIx. Try again at the next. 4257c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 4258c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4259c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4260c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (!found) { 4261c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No match at all, we have run off the end of the target text. 4262c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4263c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4264c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4265c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4266c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have found a match in CE space. 4267c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Now determine the bounds in string index space. 4268c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // There still is a chance of match failure if the CE range not correspond to 4269c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // an acceptable character range. 4270c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4271f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset); 4272c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mStart = firstCEI->lowIndex; 4273c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Check for the start of the match being within a combining sequence. 4275c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This can happen if the pattern itself begins with a combining char, and 4276c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the match found combining marks in the target text that were attached 4277c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // to something else. 4278c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This type of match should be rejected for not completely consuming a 4279c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // combining sequence. 4280b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (!isBreakBoundary(strsrch, mStart)) { 4281c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4282c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4283c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4284c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Look at the high index of the first CE in the match. If it's the same as the 4285c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // low index, the first CE in the match is in the middle of an expansion. 4286c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mStart == firstCEI->highIndex) { 4287c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru found = FALSE; 4288c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4289b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4290c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4291b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho minLimit = lastCEI->lowIndex; 4292b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4293b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (targetIx > 0) { 4294b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look at the CE following the match. If it is UCOL_NULLORDER the match 4295b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // extended to the end of input, and the match is good. 4296b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4297b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look at the high and low indices of the CE following the match. If 4298b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // they are the same it means one of two things: 4299b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 1. The match extended to the last CE from the target text, which is OK, or 4300b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 2. The last CE that was part of the match is in an expansion that extends 4301b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // to the first CE after the match. In this case, we reject the match. 4302b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const CEI *nextCEI = ceb.getPrevious(targetIx - 1); 4303b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4304b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 4305b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho found = FALSE; 4306b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4307b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4308b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = maxLimit = nextCEI->lowIndex; 4309b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4310c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // Allow matches to end in the middle of a grapheme cluster if the following 4311c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // conditions are met; this is needed to make prefix search work properly in 4312c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // Indic, see #11750 4313c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * the default breakIter is being used 4314c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * the next collation element after this combining sequence 4315c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // - has non-zero primary weight 4316c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // - corresponds to a separate character following the one at end of the current match 4317c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // (the second of these conditions, and perhaps both, may be redundant given the 4318c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // subsequent check for normalization boundary; however they are likely much faster 4319c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // tests in any case) 4320c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * the match limit is a normalization boundary 4321c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert UBool allowMidclusterMatch = FALSE; 4322c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { 4323c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert allowMidclusterMatch = 4324c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert strsrch->search->breakIter == NULL && 4325c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && 4326c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && 4327c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || 4328c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); 4329c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 4330c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // If those conditions are met, then: 4331c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 4332c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // the match limit may be backed off to a previous break boundary. This handles 4333c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // cases in which mLimit includes target characters that are ignorable with current 4334c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // settings (such as space) and which extend beyond the pattern match. 4335c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * do NOT require that end of the combining sequence not extend beyond the match in CE space 4336c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // * do NOT require that match limit be on a breakIter boundary 4337c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 4338b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Advance the match end position to the first acceptable match boundary. 4339c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // This advances the index over any combining characters. 4340b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (minLimit < maxLimit) { 4341b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4342c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // Note that we can have nba < maxLimit && nba >= minLImit, in which 4343c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // case we want to set mLimit to nba regardless of allowMidclusterMatch 4344c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // (i.e. we back off mLimit to the previous breakIterator boundary). 4345c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { 4346b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = nba; 4347b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4348c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4349b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4350c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (!allowMidclusterMatch) { 4351c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // If advancing to the end of a combining sequence in character indexing space 4352c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // advanced us beyond the end of the match in CE space, reject this match. 4353c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (mLimit > maxLimit) { 4354c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert found = FALSE; 4355c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 4356b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4357c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // Make sure the end of the match is on a break boundary 4358c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if (!isBreakBoundary(strsrch, mLimit)) { 4359c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert found = FALSE; 4360c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 4361b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4362b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4363b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4364b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // No non-ignorable CEs after this point. 4365b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The maximum position is detected by boundary after 4366b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the last non-ignorable CE. Combining sequence 4367b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // across the start index will be truncated. 4368b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4369b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 4370c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4371b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4372c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #ifdef USEARCH_DEBUG 4373c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4374c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4375c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4376c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #endif 4377b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4378c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4379b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (! checkIdentical(strsrch, mStart, mLimit)) { 4380b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = FALSE; 4381b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 4382b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4383c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (found) { 4384c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4385c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4386c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4387c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4388c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #ifdef USEARCH_DEBUG 4389c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (getenv("USEARCH_DEBUG") != NULL) { 4390c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4391c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t lastToPrint = ceb.limitIx+2; 4392c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4393c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4394c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4395c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru printf("\n%s\n", found? "match found" : "no match"); 4396c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4397c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru #endif 4398c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4399c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // All Done. Store back the match bounds to the caller. 4400c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4401c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (found==FALSE) { 4402c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mLimit = -1; 4403c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mStart = -1; 4404c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4405c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4406c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (matchStart != NULL) { 4407c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *matchStart= mStart; 4408c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4409c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4410c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (matchLimit != NULL) { 4411c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *matchLimit = mLimit; 4412c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4413c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4414c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return found; 4415c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4416c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// internal use methods declared in usrchimp.h ----------------------------- 4418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) 4420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 4421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4426c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 4427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 4428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 4429f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *patternce = strsrch->pattern.ces; 4430f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t patterncelength = strsrch->pattern.cesLength; 4431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset = ucol_getOffset(coleiter); 4432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status used in setting coleiter offset, since offset is checked in 4434b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // shiftForward before setting the coleiter offset, status never 4435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // a failure 4436b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patterncelength); 4438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (textoffset <= textlength) 4439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t patternceindex = patterncelength - 1; 4441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t targetce; 4442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool found = FALSE; 4443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastce = UCOL_NULLORDER; 4444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // finding the last pattern ce match, imagine composite characters 4449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for example: search for pattern A in text \u00C0 4450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we'll have to skip \u0300 the grave first before we get to A 4451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_previous(coleiter, status); 4452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4457b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) { 4458b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // this is for the text \u0315\u0300 that requires 4459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalization and pattern \u0300, where \u0315 is ignorable 4460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastce = targetce; 4464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 446550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == patternce[patternceindex]) { 4467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the first ce can be a contraction 4468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = TRUE; 4469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!hasExpansion(coleiter)) { 4472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //targetce = lastce; 4478b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (found && patternceindex > 0) { 4480b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho lastce = targetce; 4481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_previous(coleiter, status); 4482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE) { 4488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex --; 449250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4493b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = found && targetce == patternce[patternceindex]; 4494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4495b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = lastce; 4497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!found) { 4499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4502b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = shiftForward(strsrch, textoffset, lastce, 4503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex); 4504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked at loop. 4505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex = patterncelength; 4506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkNextExactMatch(strsrch, &textoffset, status)) { 4510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked in ucol_setOffset 4511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 4513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4517c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 4518c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t textOffset = ucol_getOffset(strsrch->textIter); 4519c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t start = -1; 4520c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t end = -1; 4521c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4522c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4523c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedIndex = start; 4524c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedLength = end - start; 4525c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 4526c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4527c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 4528c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4529c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4530c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) 4534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 4535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4540c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 4541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 4542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textlength = strsrch->search->textLength; 4543f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *patternce = strsrch->pattern.ces; 4544f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t patterncelength = strsrch->pattern.cesLength; 4545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset = ucol_getOffset(coleiter); 4546b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool hasPatternAccents = 4547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4548b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4549b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patterncelength); 4551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0] = 0; 4552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents[0] = 0; 4553b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (textoffset <= textlength) 4555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex = patterncelength - 1; 4557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t targetce; 4558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool found = FALSE; 4559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastce = UCOL_NULLORDER; 4560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // finding the last pattern ce match, imagine composite characters 4565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for example: search for pattern A in text \u00C0 4566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we'll have to skip \u0300 the grave first before we get to A 4567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_previous(coleiter, status); 4568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastce = targetce; 4575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 457650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == patternce[patternceindex]) { 4578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the first ce can be a contraction 4579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = TRUE; 4580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!hasExpansion(coleiter)) { 4583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4587b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (found && patternceindex > 0) { 4589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_previous(coleiter, status); 4590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE) { 4596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex --; 460050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4601b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = found && targetce == patternce[patternceindex]; 4602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // initializing the rearranged accent array 4605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (hasPatternAccents && !found) { 4606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0] = 0; 4607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents[0] = 0; 4608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = doNextCanonicalMatch(strsrch, textoffset, status); 4612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!found) { 4615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4618b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = shiftForward(strsrch, textoffset, lastce, 4619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex); 4620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // status checked at loop 4621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex = patterncelength; 4622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4624b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkNextCanonicalMatch(strsrch, &textoffset, status)) { 4626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 4628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4632c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 4633c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t textOffset = ucol_getOffset(strsrch->textIter); 4634c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t start = -1; 4635c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t end = -1; 4636c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4637c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4638c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedIndex = start; 4639c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedLength = end - start; 4640c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 4641c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4642c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 4643c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4644c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4645c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) 4649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 4650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4655c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 4656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 4657f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *patternce = strsrch->pattern.ces; 4658f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t patterncelength = strsrch->pattern.cesLength; 4659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset = ucol_getOffset(coleiter); 4660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // shifting it check for setting offset 4662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if setOffset is called previously or there was no previous match, we 4663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // leave the offset as it is. 4664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->matchedIndex != USEARCH_DONE) { 4665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset = strsrch->search->matchedIndex; 4666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4667b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4668b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patterncelength); 4670b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (textoffset >= 0) 4672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex = 1; 4674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t targetce; 4675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool found = FALSE; 4676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t firstce = UCOL_NULLORDER; 4677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if status is a failure, ucol_setOffset does nothing 4679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4682b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // finding the first pattern ce match, imagine composite 4683b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // characters. for example: search for pattern \u0300 in text 4684b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // \u00C0, we'll have to skip A first before we get to 4685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // \u0300 the grave accent 4686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_next(coleiter, status); 4687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru firstce = targetce; 4694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) { 4696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4697b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 469850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == patternce[0]) { 4700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = TRUE; 4701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!hasExpansion(coleiter)) { 4704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // checking for accents in composite character 4705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //targetce = firstce; 4711b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (found && (patternceindex < patterncelength)) { 4713b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho firstce = targetce; 4714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_next(coleiter, status); 4715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE) { 4721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 472450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4725b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = found && targetce == patternce[patternceindex]; 4726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex ++; 4727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4728b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = firstce; 4730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!found) { 4732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4735b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4736b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = reverseShift(strsrch, textoffset, targetce, 4737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex); 4738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex = 0; 4739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4741b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkPreviousExactMatch(strsrch, &textoffset, status)) { 4743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 4745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4749c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 4750b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t textOffset; 4751b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4752b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (strsrch->search->isOverlap) { 4753b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (strsrch->search->matchedIndex != USEARCH_DONE) { 4754b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4755b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4756b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // move the start position at the end of possible match 4757b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho initializePatternPCETable(strsrch, status); 4758fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (!initTextProcessedIter(strsrch, status)) { 4759fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius setMatchNotFound(strsrch); 4760fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 4761fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 4762f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 4763fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4764b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (pce == UCOL_PROCESSED_NULLORDER) { 4765b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // at the end of the text 4766b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 4767b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4768b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4769b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_FAILURE(*status)) { 4770b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho setMatchNotFound(strsrch); 4771b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return FALSE; 4772b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4773b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = ucol_getOffset(strsrch->textIter); 4774b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4775b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4776b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = ucol_getOffset(strsrch->textIter); 4777b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4778b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4779c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t start = -1; 4780c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t end = -1; 4781c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4782c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4783c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedIndex = start; 4784c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedLength = end - start; 4785c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 4786c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4787c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 4788c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4789c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4790c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4793b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruUBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 4794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status) 4795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 4796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4801c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if BOYER_MOORE 4802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *coleiter = strsrch->textIter; 4803f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t *patternce = strsrch->pattern.ces; 4804f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t patterncelength = strsrch->pattern.cesLength; 4805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textoffset = ucol_getOffset(coleiter); 4806b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UBool hasPatternAccents = 4807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4808b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // shifting it check for setting offset 4810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if setOffset is called previously or there was no previous match, we 4811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // leave the offset as it is. 4812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strsrch->search->matchedIndex != USEARCH_DONE) { 4813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru textoffset = strsrch->search->matchedIndex; 4814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4815b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4816b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patterncelength); 4818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0] = 0; 4819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents[0] = 0; 4820b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (textoffset >= 0) 4822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternceindex = 1; 4824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t targetce; 4825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool found = FALSE; 4826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t firstce = UCOL_NULLORDER; 4827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4830b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // finding the first pattern ce match, imagine composite 4831b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // characters. for example: search for pattern \u0300 in text 4832b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // \u00C0, we'll have to skip A first before we get to 4833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // \u0300 the grave accent 4834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_next(coleiter, status); 4835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru firstce = targetce; 4842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4843b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 484450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == patternce[0]) { 4846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the first ce can be a contraction 4847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = TRUE; 4848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!hasExpansion(coleiter)) { 4851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // checking for accents in composite character 4852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = firstce; 4858b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 4859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (found && patternceindex < patterncelength) { 4860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = ucol_next(coleiter, status); 4861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = FALSE; 4863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru targetce = getCE(strsrch, targetce); 4866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (targetce == UCOL_IGNORABLE) { 4867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 487050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4871b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru found = found && targetce == patternce[patternceindex]; 4872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex ++; 4873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // initializing the rearranged accent array 4876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (hasPatternAccents && !found) { 4877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalPrefixAccents[0] = 0; 4878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strsrch->canonicalSuffixAccents[0] = 0; 4879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru found = doPreviousCanonicalMatch(strsrch, textoffset, status); 4883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!found) { 4886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*status)) { 4887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4889b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru textoffset = reverseShift(strsrch, textoffset, targetce, 4890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex); 4891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternceindex = 0; 4892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 4893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) { 4896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setColEIterOffset(coleiter, textoffset); 4897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 4898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru setMatchNotFound(strsrch); 4901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 4902c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#else 4903b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t textOffset; 4904b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4905b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (strsrch->search->isOverlap) { 4906b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (strsrch->search->matchedIndex != USEARCH_DONE) { 4907b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4908b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4909b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // move the start position at the end of possible match 4910b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho initializePatternPCETable(strsrch, status); 4911fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (!initTextProcessedIter(strsrch, status)) { 4912fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius setMatchNotFound(strsrch); 4913fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 4914fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 4915f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 4916fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4917b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (pce == UCOL_PROCESSED_NULLORDER) { 4918b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // at the end of the text 4919b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 4920b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4921b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4922b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_FAILURE(*status)) { 4923b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho setMatchNotFound(strsrch); 4924b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return FALSE; 4925b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4926b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = ucol_getOffset(strsrch->textIter); 4927b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4928b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 4929b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho textOffset = ucol_getOffset(strsrch->textIter); 4930b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4931b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4932c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t start = -1; 4933c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t end = -1; 4934c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4935c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4936c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedIndex = start; 4937c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru strsrch->search->matchedLength = end - start; 4938c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return TRUE; 4939c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4940c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setMatchNotFound(strsrch); 4941c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return FALSE; 4942c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4943c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif 4944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_COLLATION */ 4947