/*
**********************************************************************
*   Copyright (C) 2001-2011 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifndef USRCHIMP_H
10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define USRCHIMP_H
11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h"
13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_COLLATION
15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
1650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/normalizer2.h"
17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucol.h"
18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucoleitr.h"
19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ubrk.h"
20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define INITIAL_ARRAY_SIZE_       256
22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define MAX_TABLE_SIZE_           257
23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct USearch {
25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // required since collation element iterator does not have a getText API
26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const UChar              *text;
27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          int32_t             textLength; // exact length
28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          UBool               isOverlap;
29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          UBool               isCanonicalMatch;
3050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho          int16_t             elementComparisonType;
3150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho          UBreakIterator     *internalBreakIter;  //internal character breakiterator
32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          UBreakIterator     *breakIter;
33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // value USEARCH_DONE is the default value
34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // if we are not at the start of the text or the end of the text,
35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // depending on the iteration direction and matchedIndex is USEARCH_DONE
36c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    // it means that we can't find any more matches in that particular direction
37c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru          int32_t             matchedIndex;
38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          int32_t             matchedLength;
39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          UBool               isForwardSearching;
40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          UBool               reset;
41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru};
42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct UPattern {
44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const UChar              *text;
45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          int32_t             textLength; // exact length
46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          // length required for backwards ce comparison
47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          int32_t             CELength;
48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          int32_t            *CE;
49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          int32_t             CEBuffer[INITIAL_ARRAY_SIZE_];
50c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru          int32_t             PCELength;
51c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru          int64_t            *PCE;
52c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru          int64_t             PCEBuffer[INITIAL_ARRAY_SIZE_];
53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          UBool               hasPrefixAccents;
54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          UBool               hasSuffixAccents;
55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          int16_t             defaultShiftSize;
56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          int16_t             shift[MAX_TABLE_SIZE_];
57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru          int16_t             backShift[MAX_TABLE_SIZE_];
58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru};
59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct UStringSearch {
61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    struct USearch            *search;
62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    struct UPattern            pattern;
63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const  UCollator          *collator;
6483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    const  icu::Normalizer2   *nfd;
65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // positions within the collation element iterator is used to determine
66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // if we are at the start of the text.
67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru           UCollationElements *textIter;
68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // utility collation element, used throughout program for temporary
69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // iteration.
70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru           UCollationElements *utilIter;
71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru           UBool               ownCollator;
72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru           UCollationStrength  strength;
73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru           uint32_t            ceMask;
74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru           uint32_t            variableTop;
75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru           UBool               toShift;
76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru           UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru           UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru};
79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Exact matches without checking for the ends for extra accents.
82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* The match after the position within the collation element iterator is to be
83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* found.
84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* After a match is found the offset in the collation element iterator will be
85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* shifted to the start of the match.
86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Implementation note:
87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* For tertiary we can't use the collator->tertiaryMask, that is a
88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* preprocessed mask that takes into account case options. since we are only
89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* concerned with exact matches, we don't need that.
90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Alternate handling - since only the 16 most significant digits is only used,
91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* we can safely do a compare without masking if the ce is a variable, we mask
92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* and get only the primary values no shifting to quartenary is required since
93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* all primary values less than variabletop will need to be masked off anyway.
94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* If the end character is composite and the pattern ce does not match the text
95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* ce, we skip it until we find a match in the end composite character or when
96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* it has passed the character. This is so that we can match pattern "a" with
97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* the text "\u00e6"
98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data
99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any
100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if an exact match is found, FALSE otherwise
101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/
102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CFUNC
103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Canonical matches.
107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* According to the definition, matches found here will include the whole span
108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* of beginning and ending accents if it overlaps that region.
109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data
110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any
111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if a canonical match is found, FALSE otherwise
112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/
113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CFUNC
114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Gets the previous match.
118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Comments follows from handleNextExact
119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data
120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any
121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return True if a exact math is found, FALSE otherwise.
122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/
123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CFUNC
124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Canonical matches.
128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* According to the definition, matches found here will include the whole span
129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* of beginning and ending accents if it overlaps that region.
130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param strsrch string search data
131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @param status error status if any
132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* @return TRUE if a canonical match is found, FALSE otherwise
133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/
134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CFUNC
135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                                      UErrorCode    *status);
137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_COLLATION */
139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif
141