1f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/*
2f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)**********************************************************************
3f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   Copyright (C) 2001-2010 IBM and others. All rights reserved.
4f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)**********************************************************************
5f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   Date        Name        Description
6f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*  08/13/2001   synwee      Creation.
7f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)**********************************************************************
8f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/
9f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#ifndef USRCHIMP_H
10f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define USRCHIMP_H
11f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
12f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/utypes.h"
13f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
14f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#if !UCONFIG_NO_COLLATION
15f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
16f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/normalizer2.h"
17f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ucol.h"
18f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ucoleitr.h"
19f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ubrk.h"
20f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
// Element count of the fixed-size buffers declared in this header:
// UPattern::CEBuffer, UPattern::PCEBuffer, and
// UStringSearch::canonicalPrefixAccents / canonicalSuffixAccents.
21f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define INITIAL_ARRAY_SIZE_       256
// Entry count of the UPattern::shift and UPattern::backShift tables.
22f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define MAX_TABLE_SIZE_           257
23f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
// Search-side state: the text being searched, the match options, the break
// iterators in use, and the position/length of the most recent match.
24f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)struct USearch {
25f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // required since the collation element iterator does not have a getText API
26f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    const UChar              *text;
27f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int32_t             textLength; // exact length
28f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          UBool               isOverlap;
29f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          UBool               isCanonicalMatch;
30f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int16_t             elementComparisonType; // NOTE(review): value set is defined outside this header - confirm
31f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          UBreakIterator     *internalBreakIter;  // internal character break iterator
32f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          UBreakIterator     *breakIter;
33f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // USEARCH_DONE is the default value of matchedIndex.
34f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // If we are not at the start of the text or the end of the text
35f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // (depending on the iteration direction) and matchedIndex is USEARCH_DONE,
36f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // it means that we can't find any more matches in that particular direction.
37f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int32_t             matchedIndex;
38f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int32_t             matchedLength;
39f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          UBool               isForwardSearching;
40f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          UBool               reset;
41f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)};
42f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
// Pattern-side state: the pattern text together with the CE/PCE arrays,
// the prefix/suffix accent flags and the shift tables declared below.
43f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)struct UPattern {
44f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    const UChar              *text;
45f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int32_t             textLength; // exact length
46f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          // length required for backwards ce comparison
47f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int32_t             CELength;
          // NOTE(review): CE presumably points at CEBuffer unless more than
          // INITIAL_ARRAY_SIZE_ entries are needed - confirm in the implementation.
48f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int32_t            *CE;
49f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int32_t             CEBuffer[INITIAL_ARRAY_SIZE_];
50f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int32_t             PCELength;
          // NOTE(review): 64-bit counterpart of CE/CEBuffer; PCE presumably
          // points at PCEBuffer by default - confirm in the implementation.
51f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int64_t            *PCE;
52f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int64_t             PCEBuffer[INITIAL_ARRAY_SIZE_];
53f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          UBool               hasPrefixAccents;
54f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          UBool               hasSuffixAccents;
55f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int16_t             defaultShiftSize;
          // forward and backward shift tables, MAX_TABLE_SIZE_ entries each
56f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int16_t             shift[MAX_TABLE_SIZE_];
57f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          int16_t             backShift[MAX_TABLE_SIZE_];
58f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)};
59f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
// Top-level string search object: ties together the search state, the
// embedded pattern, the collator, the NFD normalizer and the collation
// element iterators declared below.
60f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)struct UStringSearch {
61f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    struct USearch            *search;
62f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    struct UPattern            pattern;
63f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    const  UCollator          *collator;
64f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    const  U_NAMESPACE_QUALIFIER Normalizer2 *nfd;
65f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // the position within the collation element iterator is used to determine
66f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // if we are at the start of the text.
67f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           UCollationElements *textIter;
68f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // utility collation element iterator, used throughout the program for
69f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // temporary iteration.
70f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           UCollationElements *utilIter;
71f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           UBool               ownCollator; // NOTE(review): presumably TRUE when this object must close collator - confirm
72f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           UCollationStrength  strength;
73f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           uint32_t            ceMask;
74f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           uint32_t            variableTop;
75f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           UBool               toShift;
    // fixed-size accent buffers, INITIAL_ARRAY_SIZE_ UChars each
76f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
77f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
78f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)};
79f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
80f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/**
81f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Exact matches without checking the ends for extra accents.
82f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* The match after the position within the collation element iterator is to be
83f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* found.
84f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* After a match is found, the offset in the collation element iterator will be
85f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* shifted to the start of the match.
86f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Implementation note:
87f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* For tertiary we can't use the collator->tertiaryMask; that is a
88f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* preprocessed mask that takes into account case options. Since we are only
89f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* concerned with exact matches, we don't need that.
90f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Alternate handling - since only the 16 most significant digits are used,
91f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* we can safely do a compare without masking. If the ce is a variable, we mask
92f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* and get only the primary values; no shifting to quaternary is required since
93f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* all primary values less than variabletop will need to be masked off anyway.
94f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* If the end character is composite and the pattern ce does not match the text
95f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* ce, we skip it until we find a match in the end composite character or when
96f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* it has passed the character. This is so that we can match pattern "a" with
97f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* the text "\u00e6".
98f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param strsrch string search data
99f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param status error status if any
100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @return TRUE if an exact match is found, FALSE otherwise
101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/
102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC
103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/**
106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Canonical matches, for the next-match direction.
107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* According to the definition, matches found here will include the whole span
108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* of beginning and ending accents if the match overlaps that region.
109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param strsrch string search data
110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param status error status if any
111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @return TRUE if a canonical match is found, FALSE otherwise
112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/
113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC
114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/**
117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Gets the previous exact match.
118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* The comments on usearch_handleNextExact apply here as well.
119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param strsrch string search data
120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param status error status if any
121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @return TRUE if an exact match is found, FALSE otherwise.
122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/
123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC
124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/**
127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Canonical matches, for the previous-match direction.
128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* According to the definition, matches found here will include the whole span
129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* of beginning and ending accents if the match overlaps that region.
130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param strsrch string search data
131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @param status error status if any
132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* @return TRUE if a canonical match is found, FALSE otherwise
133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/
134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC
135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                                      UErrorCode    *status);
137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif /* #if !UCONFIG_NO_COLLATION */
139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif
141