1c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/* 2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ****************************************************************************** 3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott * Copyright (C) 1996-2009, International Business Machines * 4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott * Corporation and others. All Rights Reserved. * 5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ****************************************************************************** 6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott */ 7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/utypes.h" 9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if !UCONFIG_NO_COLLATION 11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/unistr.h" 13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/putil.h" 14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/usearch.h" 15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "cmemory.h" 17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/coll.h" 18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/tblcoll.h" 19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/coleitr.h" 20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/ucoleitr.h" 21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/regex.h" // TODO: make conditional on regexp being built. 23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/uniset.h" 25c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/uset.h" 26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/ustring.h" 27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "hash.h" 28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "uhash.h" 29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "ucol_imp.h" 30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unormimp.h" 31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/colldata.h" 33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/bmsearch.h" 34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_BEGIN 36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) 38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define DELETE_ARRAY(array) uprv_free((void *) (array)) 40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstruct CEI 43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint32_t order; 45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t lowOffset; 46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t highOffset; 47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}; 48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottclass Target : public UMemory 50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottpublic: 52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status); 53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ~Target(); 54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott void setTargetString(const UnicodeString *target); 56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const CEI *nextCE(int32_t offset); 58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const CEI *prevCE(int32_t offset); 59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t stringLength(); 61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UChar charAt(int32_t offset); 62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBool isBreakBoundary(int32_t offset); 64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t nextBreakBoundary(int32_t offset); 65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t nextSafeBoundary(int32_t offset); 66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end); 68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott void setOffset(int32_t offset); 70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott void setLast(int32_t last); 71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t getOffset(); 72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottprivate: 74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott CEI *ceb; 75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t bufferSize; 76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t bufferMin; 77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t bufferMax; 78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint32_t strengthMask; 80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UCollationStrength strength; 81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint32_t variableTop; 82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBool toShift; 83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UCollator *coll; 84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const UnicodeString *targetString; 86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const UChar *targetBuffer; 87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t targetLength; 88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UCollationElements *elements; 90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBreakIterator *charBreakIterator; 91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}; 92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottTarget::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status) 94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott : bufferSize(0), bufferMin(0), bufferMax(0), 95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator), 96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL) 97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott strength = ucol_getStrength(coll); 99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott variableTop = ucol_getVariableTop(coll, &status); 101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // find the largest expansion 103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint8_t maxExpansion = 0; 104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) { 105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (*expansion > maxExpansion) { 106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott maxExpansion = *expansion; 107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // room for an extra character on each end, plus 4 for safety 111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bufferSize = patternLength + (2 * maxExpansion) + 4; 112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb = NEW_ARRAY(CEI, bufferSize); 114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (ceb == NULL) { 116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (target != NULL) { 121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott setTargetString(target); 122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott switch (strength) 125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott { 126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott default: 127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott strengthMask |= UCOL_TERTIARYORDERMASK; 128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott /* fall through */ 129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott case UCOL_SECONDARY: 131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott strengthMask |= UCOL_SECONDARYORDERMASK; 132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott /* fall through */ 133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott case UCOL_PRIMARY: 135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott strengthMask |= UCOL_PRIMARYORDERMASK; 136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottTarget::~Target() 140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ubrk_close(charBreakIterator); 142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucol_closeElements(elements); 143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(ceb); 145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid Target::setTargetString(const UnicodeString *target) 148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (charBreakIterator != NULL) { 150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ubrk_close(charBreakIterator); 151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucol_closeElements(elements); 152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott targetString = target; 155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (targetString != NULL) { 157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott targetBuffer = targetString->getBuffer(); 160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott targetLength = targetString->length(); 161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status); 163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucol_forceHanImplicit(elements, &status); 164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status), 166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott targetBuffer, targetLength, &status); 167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott targetBuffer = NULL; 169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott targetLength = 0; 170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst CEI *Target::nextCE(int32_t offset) 174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t low = -1, high = -1; 177c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint32_t order; 178c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBool cont = FALSE; 179c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 180c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (offset >= bufferMin && offset < bufferMax) { 181c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return &ceb[offset]; 182c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 183c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 184c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (bufferMax >= bufferSize || offset != bufferMax) { 185c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return NULL; 186c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 187c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 188c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott do { 189c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott low = ucol_getOffset(elements); 190c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order = ucol_next(elements, &status); 191c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott high = ucol_getOffset(elements); 192c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 193c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (order == (uint32_t)UCOL_NULLORDER) { 194c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott //high = low = -1; 195c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 196c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 197c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 198c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott cont = isContinuation(order); 199c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order &= strengthMask; 200c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 201c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) { 202c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (strength >= UCOL_QUATERNARY) { 203c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order &= UCOL_PRIMARYORDERMASK; 204c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 205c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order = UCOL_IGNORABLE; 206c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 207c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 208c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } while (order == UCOL_IGNORABLE); 209c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 210c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (cont) { 211c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order |= UCOL_CONTINUATION_MARKER; 212c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 213c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 214c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb[offset].order = order; 215c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb[offset].lowOffset = low; 216c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb[offset].highOffset = high; 217c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 218c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bufferMax += 1; 219c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 220c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return &ceb[offset]; 221c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 222c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 223c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst CEI *Target::prevCE(int32_t offset) 224c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 225c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 226c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t low = -1, high = -1; 227c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint32_t order; 228c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBool cont = FALSE; 229c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 230c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (offset >= bufferMin && offset < bufferMax) { 231c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return &ceb[offset]; 232c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 233c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 234c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (bufferMax >= bufferSize || offset != bufferMax) { 235c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return NULL; 236c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 237c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 238c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott do { 239c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott high = ucol_getOffset(elements); 240c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order = ucol_previous(elements, &status); 241c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott low = ucol_getOffset(elements); 242c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 243c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (order == (uint32_t)UCOL_NULLORDER) { 244c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 245c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 246c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 247c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott cont = isContinuation(order); 248c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order &= strengthMask; 249c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 250c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) { 251c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (strength >= UCOL_QUATERNARY) { 252c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order &= UCOL_PRIMARYORDERMASK; 253c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 254c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order = UCOL_IGNORABLE; 255c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 256c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 257c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } while (order == UCOL_IGNORABLE); 258c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 259c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bufferMax += 1; 260c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 261c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (cont) { 262c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott order |= UCOL_CONTINUATION_MARKER; 263c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 264c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 265c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb[offset].order = order; 266c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb[offset].lowOffset = low; 267c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb[offset].highOffset = high; 268c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 269c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return &ceb[offset]; 270c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 271c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 272c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t Target::stringLength() 273c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 274c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (targetString != NULL) { 275c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return targetLength; 276c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 277c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 278c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return 0; 279c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 280c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 281c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUChar Target::charAt(int32_t offset) 282c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 283c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (targetString != NULL) { 284c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return targetBuffer[offset]; 285c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 286c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 287c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return 0x0000; 288c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 289c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 290c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid Target::setOffset(int32_t offset) 291c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 292c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 293c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 294c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bufferMin = 0; 295c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bufferMax = 0; 296c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 297c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucol_setOffset(elements, offset, &status); 298c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 299c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 300c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid Target::setLast(int32_t last) 301c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 302c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 303c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 304c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bufferMin = 0; 305c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bufferMax = 1; 306c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 307c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb[0].order = UCOL_NULLORDER; 308c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb[0].lowOffset = last; 309c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ceb[0].highOffset = last; 310c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 311c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucol_setOffset(elements, last, &status); 312c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 313c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 314c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t Target::getOffset() 315c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 316c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ucol_getOffset(elements); 317c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 318c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 319c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool Target::isBreakBoundary(int32_t offset) 320c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 321c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ubrk_isBoundary(charBreakIterator, offset); 322c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 323c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 324c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t Target::nextBreakBoundary(int32_t offset) 325c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 326c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ubrk_following(charBreakIterator, offset); 327c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 328c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 329c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t Target::nextSafeBoundary(int32_t offset) 330c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 331c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott while (offset < targetLength) { 332c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott //UChar ch = charAt(offset); 333c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UChar ch = targetBuffer[offset]; 334c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 335c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) { 336c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return offset; 337c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 338c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 339c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott offset += 1; 340c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 341c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 342c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return targetLength; 343c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 344c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 345c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end) 346c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 347c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (strength < UCOL_IDENTICAL) { 348c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return TRUE; 349c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 350c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 351c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UChar t2[32], p2[32]; 352c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const UChar *pBuffer = pattern.getBuffer(); 353c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t pLength = pattern.length(); 354c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t length = end - start; 355c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 356c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR; 357c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 358c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t decomplength = unorm_decompose(t2, ARRAY_SIZE(t2), 359c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott targetBuffer + start, length, 360c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott FALSE, 0, &status); 361c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 362c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // use separate status2 in case of buffer overflow 363c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (decomplength != unorm_decompose(p2, ARRAY_SIZE(p2), 364c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott pBuffer, pLength, 365c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott FALSE, 0, &status2)) { 366c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return FALSE; // lengths are different 367c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 368c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 369c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // compare contents 370c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UChar *text, *pat; 371c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 372c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(U_SUCCESS(status)) { 373c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott text = t2; 374c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott pat = p2; 375c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else if(status == U_BUFFER_OVERFLOW_ERROR) { 376c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_ZERO_ERROR; 377c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 378c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // allocate one buffer for both decompositions 379c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott text = NEW_ARRAY(UChar, decomplength * 2); 380c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 381c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Check for allocation failure. 382c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (text == NULL) { 383c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return FALSE; 384c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 385c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 386c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott pat = text + decomplength; 387c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 388c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott unorm_decompose(text, decomplength, targetBuffer + start, 389c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott length, FALSE, 0, &status); 390c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 391c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott unorm_decompose(pat, decomplength, pBuffer, 392c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott pLength, FALSE, 0, &status); 393c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 394c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // NFD failed, make sure that u_memcmp() does not overrun t2 & p2 395c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // and that we don't uprv_free() an undefined text pointer 396c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott text = pat = t2; 397c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott decomplength = 0; 398c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 399c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 400c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBool result = (UBool)(u_memcmp(pat, text, decomplength) == 0); 401c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 402c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(text != t2) { 403c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(text); 404c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 405c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 406c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // return FALSE if NFD failed 407c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return U_SUCCESS(status) && result; 408c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 409c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 410c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define HASH_TABLE_SIZE 257 411c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 412c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottclass BadCharacterTable : public UMemory 413c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 414c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottpublic: 415c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status); 416c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ~BadCharacterTable(); 417c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 418c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t operator[](uint32_t ce) const; 419c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t getMaxSkip() const; 420c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t minLengthInChars(int32_t index); 421c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 422c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottprivate: 423c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static int32_t hash(uint32_t ce); 424c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 425c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t maxSkip; 426c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t badCharacterTable[HASH_TABLE_SIZE]; 427c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 428c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t *minLengthCache; 429c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}; 430c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 431c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottBadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status) 432c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott : minLengthCache(NULL) 433c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 434c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t plen = patternCEs.size(); 435c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 436c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // **** need a better way to deal with this **** 437c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_FAILURE(status) || plen == 0) { 438c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 439c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 440c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 441c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t *history = NEW_ARRAY(int32_t, plen); 442c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 443c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (history == NULL) { 444c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 445c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 446c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 447c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 448c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (int32_t i = 0; i < plen; i += 1) { 449c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott history[i] = -1; 450c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 451c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 452c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott minLengthCache = NEW_ARRAY(int32_t, plen + 1); 453c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 454c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (minLengthCache == NULL) { 455c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(history); 456c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 457c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 458c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 459c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 460c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history); 461c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 462c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) { 463c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott badCharacterTable[j] = maxSkip; 464c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 465c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 466c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for(int32_t p = 1; p < plen; p += 1) { 467c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history); 468c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 469c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Make sure this entry is not bigger than the previous one. 470c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Otherwise, we might skip too far in some cases. 471c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) { 472c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott minLengthCache[p] = minLengthCache[p - 1]; 473c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 474c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 475c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 476c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott minLengthCache[plen] = 0; 477c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 478c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for(int32_t p = 0; p < plen - 1; p += 1) { 479c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1]; 480c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 481c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 482c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(history); 483c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 484c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 485c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottBadCharacterTable::~BadCharacterTable() 486c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 487c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(minLengthCache); 488c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 489c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 490c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t BadCharacterTable::operator[](uint32_t ce) const 491c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 492c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return badCharacterTable[hash(ce)]; 493c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 494c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 495c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t BadCharacterTable::getMaxSkip() const 496c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 497c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return maxSkip; 498c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 499c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 500c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t BadCharacterTable::minLengthInChars(int32_t index) 501c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 502c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return minLengthCache[index]; 503c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 504c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 505c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t BadCharacterTable::hash(uint32_t ce) 506c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 507c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE; 508c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 509c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 510c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottclass GoodSuffixTable : public UMemory 511c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 512c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottpublic: 513c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status); 514c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ~GoodSuffixTable(); 515c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 516c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t operator[](int32_t offset) const; 517c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 518c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottprivate: 519c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t *goodSuffixTable; 520c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}; 521c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 522c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottGoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status) 523c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott : goodSuffixTable(NULL) 524c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 525c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t patlen = patternCEs.size(); 526c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 527c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // **** need a better way to deal with this **** 528c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_FAILURE(status) || patlen <= 0) { 529c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 530c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 531c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 532c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t *suff = NEW_ARRAY(int32_t, patlen); 533c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t start = patlen - 1, end = - 1; 534c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t maxSkip = badCharacterTable.getMaxSkip(); 535c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 536c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (suff == NULL) { 537c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 538c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 539c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 540c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 541c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // initialze suff 542c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott suff[patlen - 1] = patlen; 543c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 544c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (int32_t i = patlen - 2; i >= 0; i -= 1) { 545c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // (i > start) means we're inside the last suffix match we found 546c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // ((patlen - 1) - end) is how far the end of that match is from end of pattern 547c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // (i - start) is how far we are from start of that match 548c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // (i + (patlen - 1) - end) is index of same character at end of pattern 549c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // so if any suffix match at that character doesn't extend beyond the last match, 550c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // it's the suffix for this character as well 551c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (i > start && suff[i + patlen - 1 - end] < i - start) { 552c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott suff[i] = suff[i + patlen - 1 - end]; 553c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 554c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott start = end = i; 555c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 556c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t s = patlen; 557c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 558c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott while (start >= 0 && patternCEs[start] == patternCEs[--s]) { 559c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott start -= 1; 560c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 561c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 562c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott suff[i] = end - start; 563c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 564c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 565c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 566c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // now build goodSuffixTable 567c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott goodSuffixTable = NEW_ARRAY(int32_t, patlen); 568c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 569c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (goodSuffixTable == NULL) { 570c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(suff); 571c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 572c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 573c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 574c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 575c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 576c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // initialize entries to minLengthInChars of the pattern 577c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (int32_t i = 0; i < patlen; i += 1) { 578c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott goodSuffixTable[i] = maxSkip; 579c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 580c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 581c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t prefix = 0; 582c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 583c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) { 584c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (suff[i] == i + 1) { 585c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // this matching suffix is a prefix of the pattern 586c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1); 587c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 588c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // for any mis-match before this suffix, we should skip 589c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // so that the front of the pattern (i.e. the prefix) 590c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // lines up with the front of the suffix. 591c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // (patlen - 1 - i) is the start of the suffix 592c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott while (prefix < patlen - 1 - i) { 593c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // value of maxSkip means never set... 594c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (goodSuffixTable[prefix] == maxSkip) { 595c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott goodSuffixTable[prefix] = prefixSkip; 596c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 597c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 598c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott prefix += 1; 599c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 600c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 601c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 602c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 603c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (int32_t i = 0; i < patlen - 1; i += 1) { 604c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1); 605c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 606c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 607c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(suff); 608c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 609c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 610c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottGoodSuffixTable::~GoodSuffixTable() 611c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 612c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(goodSuffixTable); 613c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 614c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 615c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t GoodSuffixTable::operator[](int32_t offset) const 616c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 617c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return goodSuffixTable[offset]; 618c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 619c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 620c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch) 621c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 622c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 623c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool BoyerMooreSearch::empty() 624c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 625c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return patCEs->size() <= 0; 626c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 627c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 628c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottCollData *BoyerMooreSearch::getData() 629c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 630c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return data; 631c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 632c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 633c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottCEList *BoyerMooreSearch::getPatternCEs() 634c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 635c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return patCEs; 636c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 637c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 638c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottBadCharacterTable *BoyerMooreSearch::getBadCharacterTable() 639c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 640c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return badCharacterTable; 641c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 642c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 643c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottGoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable() 644c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 645c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return goodSuffixTable; 646c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 647c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 648c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottBoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, 649c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode &status) 650c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL) 651c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 652c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 653c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_FAILURE(status)) { 654c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 655c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 656c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 657c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UCollator *collator = data->getCollator(); 658c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 659c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott patCEs = new CEList(collator, patternString, status); 660c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 661c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (patCEs == NULL || U_FAILURE(status)) { 662c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 663c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 664c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 665c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott badCharacterTable = new BadCharacterTable(*patCEs, data, status); 666c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 667c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (badCharacterTable == NULL || U_FAILURE(status)) { 668c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 669c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 670c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 671c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status); 672c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 673c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (targetString != NULL) { 674c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott target = new Target(collator, targetString, patCEs->size(), status); 675c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 676c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 677c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 678c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottBoyerMooreSearch::~BoyerMooreSearch() 679c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 680c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott delete target; 681c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott delete goodSuffixTable; 682c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott delete badCharacterTable; 683c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott delete patCEs; 684c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 685c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 686c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status) 687c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 688c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_FAILURE(status)) { 689c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 690c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 691c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 692c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (target == NULL) { 693c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott target = new Target(data->getCollator(), targetString, patCEs->size(), status); 694c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 695c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott target->setTargetString(targetString); 696c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 697c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 698c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 699c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. **** 700c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/* 701c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott * TODO: 702c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott * * deal with trailing (and leading?) ignorables. 703c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott * * Adding BoyerMooreSearch object slowed it down. How can we speed it up? 704c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott */ 705c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end) 706c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 707c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott /*UCollator *coll =*/ data->getCollator(); 708c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t plen = patCEs->size(); 709c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t tlen = target->stringLength(); 710c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t maxSkip = badCharacterTable->getMaxSkip(); 711c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t tOffset = offset + maxSkip; 712c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 713c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (plen <= 0) { 714c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Searching for a zero length pattern always fails. 715c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott start = end = -1; 716c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return FALSE; 717c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 718c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 719c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott while (tOffset <= tlen) { 720c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t pIndex = plen - 1; 721c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t tIndex = 0; 722c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t lIndex = 0; 723c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 724c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (tOffset < tlen) { 725c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // **** we really want to skip ahead enough to **** 726c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // **** be sure we get at least 1 non-ignorable **** 727c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // **** CE after the end of the pattern. **** 728c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t next = target->nextSafeBoundary(tOffset + 1); 729c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 730c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott target->setOffset(next); 731c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 732c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (lIndex = 0; ; lIndex += 1) { 733c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const CEI *cei = target->prevCE(lIndex); 734c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t low = cei->lowOffset; 735c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t high = cei->highOffset; 736c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 737c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (high == 0 || (low < high && low <= tOffset)) { 738c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (low < tOffset) { 739c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) { 740c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott lIndex -= 1; 741c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 742c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 743c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (high > tOffset) { 744c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott tOffset = high; 745c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 746c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 747c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 748c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 749c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 750c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 751c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 752c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott target->setLast(tOffset); 753c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott lIndex = 0; 754c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 755c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 756c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott tIndex = ++lIndex; 757c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 758c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Iterate backward until we hit the beginning of the pattern 759c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott while (pIndex >= 0) { 760c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint32_t pce = (*patCEs)[pIndex]; 761c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const CEI *tcei = target->prevCE(tIndex++); 762c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 763c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 764c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (tcei->order != pce) { 765c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // There is a mismatch at this position. Decide how far 766c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // over to shift the pattern, then try again. 767c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 768c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex]; 769c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#ifdef EXTRA_CAUTIOUS 770c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t old = tOffset; 771c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif 772c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 773c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1); 774c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 775c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (gsOffset > tOffset) { 776c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott tOffset = gsOffset; 777c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 778c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 779c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#ifdef EXTRA_CAUTIOUS 780c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Make sure we don't skip backwards... 781c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (tOffset <= old) { 782c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott tOffset = old + 1; 783c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 784c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif 785c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 786c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 787c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 788c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 789c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott pIndex -= 1; 790c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 791c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 792c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (pIndex < 0) { 793c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // We made it back to the beginning of the pattern, 794c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // which means we matched it all. Return the location. 795c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const CEI firstCEI = *target->prevCE(tIndex - 1); 796c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const CEI lastCEI = *target->prevCE(lIndex); 797c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t mStart = firstCEI.lowOffset; 798c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t minLimit = lastCEI.lowOffset; 799c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t maxLimit = lastCEI.highOffset; 800c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t mLimit; 801c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBool found = TRUE; 802c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 803c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott target->setOffset(/*tOffset*/maxLimit); 804c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 805c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const CEI nextCEI = *target->nextCE(0); 806c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 807c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (nextCEI.lowOffset > maxLimit) { 808c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott maxLimit = nextCEI.lowOffset; 809c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 810c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 811c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) { 812c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott found = FALSE; 813c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 814c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 815c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (! target->isBreakBoundary(mStart)) { 816c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott found = FALSE; 817c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 818c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 819c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (firstCEI.lowOffset == firstCEI.highOffset) { 820c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott found = FALSE; 821c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 822c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 823c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott mLimit = maxLimit; 824c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (minLimit < maxLimit) { 825c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t nbb = target->nextBreakBoundary(minLimit); 826c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 827c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (nbb >= lastCEI.highOffset) { 828c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott mLimit = nbb; 829c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 830c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 831c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 832c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (mLimit > maxLimit) { 833c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott found = FALSE; 834c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 835c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 836c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (! target->isBreakBoundary(mLimit)) { 837c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott found = FALSE; 838c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 839c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 840c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (! target->isIdentical(pattern, mStart, mLimit)) { 841c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott found = FALSE; 842c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 843c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 844c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (found) { 845c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott start = mStart; 846c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott end = mLimit; 847c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 848c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return TRUE; 849c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 850c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 851c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip? 852c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 853c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Otherwise, we're here because of a mismatch, so keep going.... 854c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 855c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 856c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // no match 857c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott start = -1; 858c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott end = -1; 859c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return FALSE; 860c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 861c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 862c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_END 863c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 864c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif // #if !UCONFIG_NO_COLLATION 865