16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org********************************************************************** 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Copyright (C) 2001-2011 IBM and others. All rights reserved. 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org********************************************************************** 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Date Name Description 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* 08/13/2001 synwee Creation. 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org********************************************************************** 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifndef USRCHIMP_H 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define USRCHIMP_H 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_COLLATION 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/normalizer2.h" 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucol.h" 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucoleitr.h" 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ubrk.h" 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define INITIAL_ARRAY_SIZE_ 256 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define MAX_TABLE_SIZE_ 257 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstruct USearch { 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // required since collation element iterator does not have a getText API 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *text; 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t textLength; // exact length 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool isOverlap; 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool isCanonicalMatch; 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int16_t elementComparisonType; 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBreakIterator *internalBreakIter; //internal character breakiterator 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBreakIterator *breakIter; 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // value USEARCH_DONE is the default value 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if we are not at the start of the text or the end of the text, 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // depending on the iteration direction and matchedIndex is USEARCH_DONE 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // it means that we can't find any more matches in that particular direction 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t matchedIndex; 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t matchedLength; 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool isForwardSearching; 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool reset; 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstruct UPattern { 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *text; 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t textLength; // exact length 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // length required for backwards ce comparison 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t CELength; 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t *CE; 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t CEBuffer[INITIAL_ARRAY_SIZE_]; 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t PCELength; 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int64_t *PCE; 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int64_t PCEBuffer[INITIAL_ARRAY_SIZE_]; 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool hasPrefixAccents; 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool hasSuffixAccents; 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int16_t defaultShiftSize; 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int16_t shift[MAX_TABLE_SIZE_]; 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int16_t backShift[MAX_TABLE_SIZE_]; 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstruct UStringSearch { 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org struct USearch *search; 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org struct UPattern pattern; 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCollator *collator; 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const icu::Normalizer2 *nfd; 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // positions within the collation element iterator is used to determine 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if we are at the start of the text. 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCollationElements *textIter; 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // utility collation element, used throughout program for temporary 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // iteration. 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCollationElements *utilIter; 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool ownCollator; 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCollationStrength strength; 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t ceMask; 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t variableTop; 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool toShift; 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_]; 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_]; 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Exact matches without checking for the ends for extra accents. 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* The match after the position within the collation element iterator is to be 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* found. 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* After a match is found the offset in the collation element iterator will be 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* shifted to the start of the match. 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Implementation note: 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* For tertiary we can't use the collator->tertiaryMask, that is a 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* preprocessed mask that takes into account case options. since we are only 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* concerned with exact matches, we don't need that. 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Alternate handling - since only the 16 most significant digits is only used, 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* we can safely do a compare without masking if the ce is a variable, we mask 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* and get only the primary values no shifting to quartenary is required since 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* all primary values less than variabletop will need to be masked off anyway. 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* If the end character is composite and the pattern ce does not match the text 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* ce, we skip it until we find a match in the end composite character or when 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* it has passed the character. This is so that we can match pattern "a" with 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* the text "\u00e6" 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param strsrch string search data 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param status error status if any 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @return TRUE if an exact match is found, FALSE otherwise 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_CFUNC 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Canonical matches. 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* According to the definition, matches found here will include the whole span 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* of beginning and ending accents if it overlaps that region. 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param strsrch string search data 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param status error status if any 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @return TRUE if a canonical match is found, FALSE otherwise 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_CFUNC 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Gets the previous match. 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Comments follows from handleNextExact 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param strsrch string search data 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param status error status if any 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @return True if a exact math is found, FALSE otherwise. 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_CFUNC 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Canonical matches. 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* According to the definition, matches found here will include the whole span 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* of beginning and ending accents if it overlaps that region. 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param strsrch string search data 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param status error status if any 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @return TRUE if a canonical match is found, FALSE otherwise 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_CFUNC 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode *status); 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif /* #if !UCONFIG_NO_COLLATION */ 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 141