1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius* Copyright (C) 2001-2011 IBM and others. All rights reserved. 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Date Name Description 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 08/13/2001 synwee Creation. 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifndef USRCHIMP_H 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define USRCHIMP_H 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_COLLATION 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/normalizer2.h" 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucol.h" 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucoleitr.h" 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ubrk.h" 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define INITIAL_ARRAY_SIZE_ 256 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define MAX_TABLE_SIZE_ 257 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct USearch { 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // required since collation element iterator does not have a getText API 26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text; 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textLength; // exact length 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool isOverlap; 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool isCanonicalMatch; 3050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int16_t elementComparisonType; 3150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBreakIterator *internalBreakIter; //internal character breakiterator 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBreakIterator *breakIter; 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // value USEARCH_DONE is the default value 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we are not at the start of the text or the end of the text, 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // depending on the iteration direction and matchedIndex is USEARCH_DONE 36c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // it means that we can't find any more matches in that particular direction 37c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t matchedIndex; 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t matchedLength; 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool isForwardSearching; 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool reset; 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct UPattern { 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UChar *text; 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t textLength; // exact length 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // length required for backwards ce comparison 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t CELength; 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t *CE; 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t CEBuffer[INITIAL_ARRAY_SIZE_]; 50c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t PCELength; 51c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t *PCE; 52c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int64_t PCEBuffer[INITIAL_ARRAY_SIZE_]; 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool hasPrefixAccents; 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool hasSuffixAccents; 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int16_t defaultShiftSize; 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int16_t shift[MAX_TABLE_SIZE_]; 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int16_t backShift[MAX_TABLE_SIZE_]; 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct UStringSearch { 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru struct USearch *search; 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru struct UPattern pattern; 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCollator *collator; 6483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius const icu::Normalizer2 *nfd; 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // positions within the collation element iterator is used to determine 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if we are at the start of the text. 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *textIter; 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // utility collation element, used throughout program for temporary 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // iteration. 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationElements *utilIter; 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool ownCollator; 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCollationStrength strength; 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t ceMask; 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t variableTop; 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool toShift; 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_]; 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_]; 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Exact matches without checking for the ends for extra accents. 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* The match after the position within the collation element iterator is to be 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* found. 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* After a match is found the offset in the collation element iterator will be 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shifted to the start of the match. 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Implementation note: 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* For tertiary we can't use the collator->tertiaryMask, that is a 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* preprocessed mask that takes into account case options. since we are only 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* concerned with exact matches, we don't need that. 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Alternate handling - since only the 16 most significant digits is only used, 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* we can safely do a compare without masking if the ce is a variable, we mask 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* and get only the primary values no shifting to quartenary is required since 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* all primary values less than variabletop will need to be masked off anyway. 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the end character is composite and the pattern ce does not match the text 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* ce, we skip it until we find a match in the end composite character or when 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* it has passed the character. This is so that we can match pattern "a" with 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* the text "\u00e6" 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if an exact match is found, FALSE otherwise 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CFUNC 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Canonical matches. 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* According to the definition, matches found here will include the whole span 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* of beginning and ending accents if it overlaps that region. 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if a canonical match is found, FALSE otherwise 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CFUNC 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the previous match. 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Comments follows from handleNextExact 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return True if a exact math is found, FALSE otherwise. 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CFUNC 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Canonical matches. 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* According to the definition, matches found here will include the whole span 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* of beginning and ending accents if it overlaps that region. 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if a canonical match is found, FALSE otherwise 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CFUNC 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *status); 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_COLLATION */ 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 141