1f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 2f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)********************************************************************** 3f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Copyright (C) 2001-2010 IBM and others. All rights reserved. 4f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)********************************************************************** 5f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Date Name Description 6f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* 08/13/2001 synwee Creation. 7f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)********************************************************************** 8f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/ 9f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#ifndef USRCHIMP_H 10f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define USRCHIMP_H 11f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 12f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/utypes.h" 13f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 14f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#if !UCONFIG_NO_COLLATION 15f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 16f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/normalizer2.h" 17f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ucol.h" 18f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ucoleitr.h" 19f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ubrk.h" 20f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 21f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define INITIAL_ARRAY_SIZE_ 256 22f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define MAX_TABLE_SIZE_ 257 23f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 24f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)struct USearch { 25f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // required since collation element iterator does not have a getText API 26f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *text; 27f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t textLength; // exact length 28f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isOverlap; 29f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isCanonicalMatch; 30f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int16_t elementComparisonType; 31f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBreakIterator *internalBreakIter; //internal character breakiterator 32f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBreakIterator *breakIter; 33f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // value USEARCH_DONE is the default value 34f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // if we are not at the start of the text or the end of the text, 35f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // depending on the iteration direction and matchedIndex is USEARCH_DONE 36f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // it means that we can't find any more matches in that particular direction 37f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t matchedIndex; 38f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t matchedLength; 39f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isForwardSearching; 40f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool reset; 41f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}; 42f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 43f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)struct UPattern { 44f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *text; 45f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t textLength; // exact length 46f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // length required for backwards ce comparison 47f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t CELength; 48f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t *CE; 49f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t CEBuffer[INITIAL_ARRAY_SIZE_]; 50f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t PCELength; 51f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int64_t *PCE; 52f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int64_t PCEBuffer[INITIAL_ARRAY_SIZE_]; 53f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool hasPrefixAccents; 54f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool hasSuffixAccents; 55f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int16_t defaultShiftSize; 56f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int16_t shift[MAX_TABLE_SIZE_]; 57f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int16_t backShift[MAX_TABLE_SIZE_]; 58f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}; 59f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 60f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)struct UStringSearch { 61f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) struct USearch *search; 62f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) struct UPattern pattern; 63f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UCollator *collator; 64f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const U_NAMESPACE_QUALIFIER Normalizer2 *nfd; 65f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // positions within the collation element iterator is used to determine 66f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // if we are at the start of the text. 67f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UCollationElements *textIter; 68f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // utility collation element, used throughout program for temporary 69f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // iteration. 70f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UCollationElements *utilIter; 71f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool ownCollator; 72f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UCollationStrength strength; 73f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint32_t ceMask; 74f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint32_t variableTop; 75f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool toShift; 76f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_]; 77f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_]; 78f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}; 79f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 80f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 81f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Exact matches without checking for the ends for extra accents. 82f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* The match after the position within the collation element iterator is to be 83f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* found. 84f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* After a match is found the offset in the collation element iterator will be 85f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* shifted to the start of the match. 86f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Implementation note: 87f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* For tertiary we can't use the collator->tertiaryMask, that is a 88f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* preprocessed mask that takes into account case options. since we are only 89f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* concerned with exact matches, we don't need that. 90f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Alternate handling - since only the 16 most significant digits is only used, 91f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* we can safely do a compare without masking if the ce is a variable, we mask 92f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* and get only the primary values no shifting to quartenary is required since 93f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* all primary values less than variabletop will need to be masked off anyway. 94f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* If the end character is composite and the pattern ce does not match the text 95f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* ce, we skip it until we find a match in the end composite character or when 96f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* it has passed the character. This is so that we can match pattern "a" with 97f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* the text "\u00e6" 98f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param strsrch string search data 99f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param status error status if any 100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @return TRUE if an exact match is found, FALSE otherwise 101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/ 102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC 103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); 104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Canonical matches. 107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* According to the definition, matches found here will include the whole span 108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* of beginning and ending accents if it overlaps that region. 109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param strsrch string search data 110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param status error status if any 111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @return TRUE if a canonical match is found, FALSE otherwise 112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/ 113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC 114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); 115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Gets the previous match. 118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Comments follows from handleNextExact 119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param strsrch string search data 120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param status error status if any 121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @return True if a exact math is found, FALSE otherwise. 122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/ 123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC 124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); 125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Canonical matches. 128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* According to the definition, matches found here will include the whole span 129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* of beginning and ending accents if it overlaps that region. 130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param strsrch string search data 131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param status error status if any 132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @return TRUE if a canonical match is found, FALSE otherwise 133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/ 134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC 135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status); 137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif /* #if !UCONFIG_NO_COLLATION */ 139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif 141