1/*
2**********************************************************************
3*   Copyright (C) 2001-2010 IBM and others. All rights reserved.
4**********************************************************************
5*   Date        Name        Description
6*  08/13/2001   synwee      Creation.
7**********************************************************************
8*/
9#ifndef USRCHIMP_H
10#define USRCHIMP_H
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/normalizer2.h"
17#include "unicode/ucol.h"
18#include "unicode/ucoleitr.h"
19#include "unicode/ubrk.h"
20
// Element count of the fixed-size arrays embedded in the structs of this
// header: CEBuffer, PCEBuffer, canonicalPrefixAccents, canonicalSuffixAccents.
#define INITIAL_ARRAY_SIZE_ 256
// Element count of the shift and backShift tables in UPattern.
#define MAX_TABLE_SIZE_ 257
23
24struct USearch {
25    // required since collation element iterator does not have a getText API
26    const UChar              *text;
27          int32_t             textLength; // exact length
28          UBool               isOverlap;
29          UBool               isCanonicalMatch;
30          int16_t             elementComparisonType;
31          UBreakIterator     *internalBreakIter;  //internal character breakiterator
32          UBreakIterator     *breakIter;
33    // value USEARCH_DONE is the default value
34    // if we are not at the start of the text or the end of the text,
35    // depending on the iteration direction and matchedIndex is USEARCH_DONE
36    // it means that we can't find any more matches in that particular direction
37          int32_t             matchedIndex;
38          int32_t             matchedLength;
39          UBool               isForwardSearching;
40          UBool               reset;
41};
42
43struct UPattern {
44    const UChar              *text;
45          int32_t             textLength; // exact length
46          // length required for backwards ce comparison
47          int32_t             CELength;
48          int32_t            *CE;
49          int32_t             CEBuffer[INITIAL_ARRAY_SIZE_];
50          int32_t             PCELength;
51          int64_t            *PCE;
52          int64_t             PCEBuffer[INITIAL_ARRAY_SIZE_];
53          UBool               hasPrefixAccents;
54          UBool               hasSuffixAccents;
55          int16_t             defaultShiftSize;
56          int16_t             shift[MAX_TABLE_SIZE_];
57          int16_t             backShift[MAX_TABLE_SIZE_];
58};
59
60struct UStringSearch {
61    struct USearch            *search;
62    struct UPattern            pattern;
63    const  UCollator          *collator;
64    const  U_NAMESPACE_QUALIFIER Normalizer2 *nfd;
65    // positions within the collation element iterator is used to determine
66    // if we are at the start of the text.
67           UCollationElements *textIter;
68    // utility collation element, used throughout program for temporary
69    // iteration.
70           UCollationElements *utilIter;
71           UBool               ownCollator;
72           UCollationStrength  strength;
73           uint32_t            ceMask;
74           uint32_t            variableTop;
75           UBool               toShift;
76           UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
77           UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
78};
79
80/**
81* Exact matches without checking for the ends for extra accents.
82* The match after the position within the collation element iterator is to be
83* found.
84* After a match is found the offset in the collation element iterator will be
85* shifted to the start of the match.
86* Implementation note:
87* For tertiary we can't use the collator->tertiaryMask, that is a
88* preprocessed mask that takes into account case options. since we are only
89* concerned with exact matches, we don't need that.
90* Alternate handling - since only the 16 most significant digits is only used,
91* we can safely do a compare without masking if the ce is a variable, we mask
92* and get only the primary values no shifting to quartenary is required since
93* all primary values less than variabletop will need to be masked off anyway.
94* If the end character is composite and the pattern ce does not match the text
95* ce, we skip it until we find a match in the end composite character or when
96* it has passed the character. This is so that we can match pattern "a" with
97* the text "\u00e6"
98* @param strsrch string search data
99* @param status error status if any
100* @return TRUE if an exact match is found, FALSE otherwise
101*/
102U_CFUNC
103UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
104
105/**
106* Canonical matches.
107* According to the definition, matches found here will include the whole span
108* of beginning and ending accents if it overlaps that region.
109* @param strsrch string search data
110* @param status error status if any
111* @return TRUE if a canonical match is found, FALSE otherwise
112*/
113U_CFUNC
114UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
115
116/**
117* Gets the previous match.
118* Comments follows from handleNextExact
119* @param strsrch string search data
120* @param status error status if any
121* @return True if a exact math is found, FALSE otherwise.
122*/
123U_CFUNC
124UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
125
126/**
127* Canonical matches.
128* According to the definition, matches found here will include the whole span
129* of beginning and ending accents if it overlaps that region.
130* @param strsrch string search data
131* @param status error status if any
132* @return TRUE if a canonical match is found, FALSE otherwise
133*/
134U_CFUNC
135UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
136                                      UErrorCode    *status);
137
138#endif /* #if !UCONFIG_NO_COLLATION */
139
140#endif
141