/*
**********************************************************************
*   Copyright (C) 2001-2011 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifndef USRCHIMP_H
106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define USRCHIMP_H
116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h"
136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_COLLATION
156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/normalizer2.h"
176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucol.h"
186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucoleitr.h"
196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ubrk.h"
206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
/**
* Element count of the fixed-size arrays embedded in the structures of this
* header: UPattern::CEBuffer, UPattern::PCEBuffer and the
* canonicalPrefixAccents / canonicalSuffixAccents arrays of UStringSearch.
*/
#define INITIAL_ARRAY_SIZE_       256
/**
* Element count of the UPattern::shift and UPattern::backShift tables.
*/
#define MAX_TABLE_SIZE_           257

246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstruct USearch {
256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // required since collation element iterator does not have a getText API
266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UChar              *text;
276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int32_t             textLength; // exact length
286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          UBool               isOverlap;
296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          UBool               isCanonicalMatch;
306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int16_t             elementComparisonType;
316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          UBreakIterator     *internalBreakIter;  //internal character breakiterator
326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          UBreakIterator     *breakIter;
336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // value USEARCH_DONE is the default value
346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // if we are not at the start of the text or the end of the text,
356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // depending on the iteration direction and matchedIndex is USEARCH_DONE
366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // it means that we can't find any more matches in that particular direction
376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int32_t             matchedIndex;
386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int32_t             matchedLength;
396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          UBool               isForwardSearching;
406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          UBool               reset;
416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org};
426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstruct UPattern {
446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UChar              *text;
456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int32_t             textLength; // exact length
466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          // length required for backwards ce comparison
476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int32_t             CELength;
486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int32_t            *CE;
496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int32_t             CEBuffer[INITIAL_ARRAY_SIZE_];
506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int32_t             PCELength;
516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int64_t            *PCE;
526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int64_t             PCEBuffer[INITIAL_ARRAY_SIZE_];
536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          UBool               hasPrefixAccents;
546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          UBool               hasSuffixAccents;
556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int16_t             defaultShiftSize;
566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int16_t             shift[MAX_TABLE_SIZE_];
576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int16_t             backShift[MAX_TABLE_SIZE_];
586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org};
596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstruct UStringSearch {
616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    struct USearch            *search;
626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    struct UPattern            pattern;
636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const  UCollator          *collator;
646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const  icu::Normalizer2   *nfd;
656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // positions within the collation element iterator is used to determine
666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // if we are at the start of the text.
676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org           UCollationElements *textIter;
686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // utility collation element, used throughout program for temporary
696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // iteration.
706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org           UCollationElements *utilIter;
716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org           UBool               ownCollator;
726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org           UCollationStrength  strength;
736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org           uint32_t            ceMask;
746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org           uint32_t            variableTop;
756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org           UBool               toShift;
766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org           UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org           UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org};
796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Exact matches without checking for the ends for extra accents.
826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* The match after the position within the collation element iterator is to be
836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* found.
846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* After a match is found the offset in the collation element iterator will be
856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* shifted to the start of the match.
866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Implementation note:
876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* For tertiary we can't use the collator->tertiaryMask, that is a
886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* preprocessed mask that takes into account case options. since we are only
896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* concerned with exact matches, we don't need that.
906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Alternate handling - since only the 16 most significant digits is only used,
916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* we can safely do a compare without masking if the ce is a variable, we mask
926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* and get only the primary values no shifting to quartenary is required since
936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* all primary values less than variabletop will need to be masked off anyway.
946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* If the end character is composite and the pattern ce does not match the text
956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* ce, we skip it until we find a match in the end composite character or when
966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* it has passed the character. This is so that we can match pattern "a" with
976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* the text "\u00e6"
986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param strsrch string search data
996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param status error status if any
1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @return TRUE if an exact match is found, FALSE otherwise
1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/
1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_CFUNC
1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Canonical matches.
1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* According to the definition, matches found here will include the whole span
1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* of beginning and ending accents if it overlaps that region.
1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param strsrch string search data
1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param status error status if any
1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @return TRUE if a canonical match is found, FALSE otherwise
1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/
1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_CFUNC
1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Gets the previous match.
1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Comments follows from handleNextExact
1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param strsrch string search data
1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param status error status if any
1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @return True if a exact math is found, FALSE otherwise.
1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/
1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_CFUNC
1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Canonical matches.
1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* According to the definition, matches found here will include the whole span
1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* of beginning and ending accents if it overlaps that region.
1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param strsrch string search data
1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @param status error status if any
1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* @return TRUE if a canonical match is found, FALSE otherwise
1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/
1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_CFUNC
1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                      UErrorCode    *status);
1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif /* #if !UCONFIG_NO_COLLATION */
1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
141