/*
**********************************************************************
*   Copyright (C) 2001-2011 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  03/22/2000   helena      Creation.
**********************************************************************
*/

10#ifndef SEARCH_H
11#define SEARCH_H
12
13#include "unicode/utypes.h"
14
15/**
16 * \file
17 * \brief C++ API: SearchIterator object.
18 */
19
20#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
21
22#include "unicode/uobject.h"
23#include "unicode/unistr.h"
24#include "unicode/chariter.h"
25#include "unicode/brkiter.h"
26#include "unicode/usearch.h"
27
28/**
29* @stable ICU 2.0
30*/
31struct USearch;
32/**
33* @stable ICU 2.0
34*/
35typedef struct USearch USearch;
36
37U_NAMESPACE_BEGIN
38
39/**
40 *
41 * <tt>SearchIterator</tt> is an abstract base class that provides
42 * methods to search for a pattern within a text string. Instances of
43 * <tt>SearchIterator</tt> maintain a current position and scans over the
44 * target text, returning the indices the pattern is matched and the length
45 * of each match.
46 * <p>
47 * <tt>SearchIterator</tt> defines a protocol for text searching.
48 * Subclasses provide concrete implementations of various search algorithms.
49 * For example, <tt>StringSearch</tt> implements language-sensitive pattern
50 * matching based on the comparison rules defined in a
51 * <tt>RuleBasedCollator</tt> object.
52 * <p>
53 * Other options for searching includes using a BreakIterator to restrict
54 * the points at which matches are detected.
55 * <p>
56 * <tt>SearchIterator</tt> provides an API that is similar to that of
57 * other text iteration classes such as <tt>BreakIterator</tt>. Using
58 * this class, it is easy to scan through text looking for all occurances of
59 * a given pattern. The following example uses a <tt>StringSearch</tt>
60 * object to find all instances of "fox" in the target string. Any other
61 * subclass of <tt>SearchIterator</tt> can be used in an identical
62 * manner.
63 * <pre><code>
64 * UnicodeString target("The quick brown fox jumped over the lazy fox");
65 * UnicodeString pattern("fox");
66 *
67 * SearchIterator *iter  = new StringSearch(pattern, target);
68 * UErrorCode      error = U_ZERO_ERROR;
69 * for (int pos = iter->first(error); pos != USEARCH_DONE;
70 *                               pos = iter->next(error)) {
71 *     printf("Found match at %d pos, length is %d\n", pos,
72 *                                             iter.getMatchLength());
73 * }
74 * </code></pre>
75 *
76 * @see StringSearch
77 * @see RuleBasedCollator
78 */
79class U_I18N_API SearchIterator : public UObject {
80
81public:
82
83    // public constructors and destructors -------------------------------
84
85    /**
86    * Copy constructor that creates a SearchIterator instance with the same
87    * behavior, and iterating over the same text.
88    * @param other the SearchIterator instance to be copied.
89    * @stable ICU 2.0
90    */
91    SearchIterator(const SearchIterator &other);
92
93    /**
94     * Destructor. Cleans up the search iterator data struct.
95     * @stable ICU 2.0
96     */
97    virtual ~SearchIterator();
98
99    // public get and set methods ----------------------------------------
100
101    /**
102     * Sets the index to point to the given position, and clears any state
103     * that's affected.
104     * <p>
105     * This method takes the argument index and sets the position in the text
106     * string accordingly without checking if the index is pointing to a
107     * valid starting point to begin searching.
108     * @param position within the text to be set. If position is less
109     *             than or greater than the text range for searching,
110     *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
111     * @param status for errors if it occurs
112     * @stable ICU 2.0
113     */
114    virtual void setOffset(int32_t position, UErrorCode &status) = 0;
115
116    /**
117     * Return the current index in the text being searched.
118     * If the iteration has gone past the end of the text
119     * (or past the beginning for a backwards search), USEARCH_DONE
120     * is returned.
121     * @return current index in the text being searched.
122     * @stable ICU 2.0
123     */
124    virtual int32_t getOffset(void) const = 0;
125
126    /**
127    * Sets the text searching attributes located in the enum
128    * USearchAttribute with values from the enum USearchAttributeValue.
129    * USEARCH_DEFAULT can be used for all attributes for resetting.
130    * @param attribute text attribute (enum USearchAttribute) to be set
131    * @param value text attribute value
132    * @param status for errors if it occurs
133    * @stable ICU 2.0
134    */
135    void setAttribute(USearchAttribute       attribute,
136                      USearchAttributeValue  value,
137                      UErrorCode            &status);
138
139    /**
140    * Gets the text searching attributes
141    * @param attribute text attribute (enum USearchAttribute) to be retrieve
142    * @return text attribute value
143    * @stable ICU 2.0
144    */
145    USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
146
147    /**
148    * Returns the index to the match in the text string that was searched.
149    * This call returns a valid result only after a successful call to
150    * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
151    * Just after construction, or after a searching method returns
152    * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
153    * <p>
154    * Use getMatchedLength to get the matched string length.
155    * @return index of a substring within the text string that is being
156    *         searched.
157    * @see #first
158    * @see #next
159    * @see #previous
160    * @see #last
161    * @stable ICU 2.0
162    */
163    int32_t getMatchedStart(void) const;
164
165    /**
166     * Returns the length of text in the string which matches the search
167     * pattern. This call returns a valid result only after a successful call
168     * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
169     * Just after construction, or after a searching method returns
170     * <tt>USEARCH_DONE</tt>, this method will return 0.
171     * @return The length of the match in the target text, or 0 if there
172     *         is no match currently.
173     * @see #first
174     * @see #next
175     * @see #previous
176     * @see #last
177     * @stable ICU 2.0
178     */
179    int32_t getMatchedLength(void) const;
180
181    /**
182     * Returns the text that was matched by the most recent call to
183     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
184     * If the iterator is not pointing at a valid match (e.g. just after
185     * construction or after <tt>USEARCH_DONE</tt> has been returned,
186     * returns an empty string.
187     * @param result stores the matched string or an empty string if a match
188     *        is not found.
189     * @see #first
190     * @see #next
191     * @see #previous
192     * @see #last
193     * @stable ICU 2.0
194     */
195    void getMatchedText(UnicodeString &result) const;
196
197    /**
198     * Set the BreakIterator that will be used to restrict the points
199     * at which matches are detected. The user is responsible for deleting
200     * the breakiterator.
201     * @param breakiter A BreakIterator that will be used to restrict the
202     *                points at which matches are detected. If a match is
203     *                found, but the match's start or end index is not a
204     *                boundary as determined by the <tt>BreakIterator</tt>,
205     *                the match will be rejected and another will be searched
206     *                for. If this parameter is <tt>NULL</tt>, no break
207     *                detection is attempted.
208     * @param status for errors if it occurs
209     * @see BreakIterator
210     * @stable ICU 2.0
211     */
212    void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
213
214    /**
215     * Returns the BreakIterator that is used to restrict the points at
216     * which matches are detected.  This will be the same object that was
217     * passed to the constructor or to <tt>setBreakIterator</tt>.
218     * Note that <tt>NULL</tt> is a legal value; it means that break
219     * detection should not be attempted.
220     * @return BreakIterator used to restrict matchings.
221     * @see #setBreakIterator
222     * @stable ICU 2.0
223     */
224    const BreakIterator * getBreakIterator(void) const;
225
226    /**
227     * Set the string text to be searched. Text iteration will hence begin at
228     * the start of the text string. This method is useful if you want to
229     * re-use an iterator to search for the same pattern within a different
230     * body of text. The user is responsible for deleting the text.
231     * @param text string to be searched.
232     * @param status for errors. If the text length is 0,
233     *        an U_ILLEGAL_ARGUMENT_ERROR is returned.
234     * @stable ICU 2.0
235     */
236    virtual void setText(const UnicodeString &text, UErrorCode &status);
237
238    /**
239     * Set the string text to be searched. Text iteration will hence begin at
240     * the start of the text string. This method is useful if you want to
241     * re-use an iterator to search for the same pattern within a different
242     * body of text.
243     * <p>
244     * Note: No parsing of the text within the <tt>CharacterIterator</tt>
245     * will be done during searching for this version. The block of text
246     * in <tt>CharacterIterator</tt> will be used as it is.
247     * The user is responsible for deleting the text.
248     * @param text string iterator to be searched.
249     * @param status for errors if any. If the text length is 0 then an
250     *        U_ILLEGAL_ARGUMENT_ERROR is returned.
251     * @stable ICU 2.0
252     */
253    virtual void setText(CharacterIterator &text, UErrorCode &status);
254
255    /**
256     * Return the string text to be searched.
257     * @return text string to be searched.
258     * @stable ICU 2.0
259     */
260    const UnicodeString & getText(void) const;
261
262    // operator overloading ----------------------------------------------
263
264    /**
265     * Equality operator.
266     * @param that SearchIterator instance to be compared.
267     * @return TRUE if both BreakIterators are of the same class, have the
268     *         same behavior, terates over the same text and have the same
269     *         attributes. FALSE otherwise.
270     * @stable ICU 2.0
271     */
272    virtual UBool operator==(const SearchIterator &that) const;
273
274    /**
275     * Not-equal operator.
276     * @param that SearchIterator instance to be compared.
277     * @return FALSE if operator== returns TRUE, and vice versa.
278     * @stable ICU 2.0
279     */
280    UBool operator!=(const SearchIterator &that) const;
281
282    // public methods ----------------------------------------------------
283
284    /**
285     * Returns a copy of SearchIterator with the same behavior, and
286     * iterating over the same text, as this one. Note that all data will be
287     * replicated, except for the text string to be searched.
288     * @return cloned object
289     * @stable ICU 2.0
290     */
291    virtual SearchIterator* safeClone(void) const = 0;
292
293    /**
294     * Returns the first index at which the string text matches the search
295     * pattern. The iterator is adjusted so that its current index (as
296     * returned by <tt>getOffset</tt>) is the match position if one
297     * was found.
298     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
299     * the iterator will be adjusted to the index USEARCH_DONE
300     * @param  status for errors if it occurs
301     * @return The character index of the first match, or
302     *         <tt>USEARCH_DONE</tt> if there are no matches.
303     * @see #getOffset
304     * @stable ICU 2.0
305     */
306    int32_t first(UErrorCode &status);
307
308    /**
309     * Returns the first index equal or greater than <tt>position</tt> at which the
310     * string text matches the search pattern. The iterator is adjusted so
311     * that its current index (as returned by <tt>getOffset</tt>) is the
312     * match position if one was found.
313     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
314     * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
315     * @param  position where search if to start from. If position is less
316     *             than or greater than the text range for searching,
317     *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
318     * @param  status for errors if it occurs
319     * @return The character index of the first match following
320     *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
321     *         matches.
322     * @see #getOffset
323     * @stable ICU 2.0
324     */
325    int32_t following(int32_t position, UErrorCode &status);
326
327    /**
328     * Returns the last index in the target text at which it matches the
329     * search pattern. The iterator is adjusted so that its current index
330     * (as returned by <tt>getOffset</tt>) is the match position if one was
331     * found.
332     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
333     * the iterator will be adjusted to the index USEARCH_DONE.
334     * @param  status for errors if it occurs
335     * @return The index of the first match, or <tt>USEARCH_DONE</tt> if
336     *         there are no matches.
337     * @see #getOffset
338     * @stable ICU 2.0
339     */
340    int32_t last(UErrorCode &status);
341
342    /**
343     * Returns the first index less than <tt>position</tt> at which the string
344     * text matches the search pattern. The iterator is adjusted so that its
345     * current index (as returned by <tt>getOffset</tt>) is the match
346     * position if one was found. If a match is not found,
347     * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
348     * adjusted to the index USEARCH_DONE
349     * <p>
350     * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
351     * result match is always less than <tt>position</tt>.
352     * When <tt>USERARCH_OVERLAP</tt> is on, the result match may span across
353     * <tt>position</tt>.
354     *
355     * @param  position where search is to start from. If position is less
356     *             than or greater than the text range for searching,
357     *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
358     * @param  status for errors if it occurs
359     * @return The character index of the first match preceding
360     *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
361     *         no matches.
362     * @see #getOffset
363     * @stable ICU 2.0
364     */
365    int32_t preceding(int32_t position, UErrorCode &status);
366
367    /**
368     * Returns the index of the next point at which the text matches the
369     * search pattern, starting from the current position
370     * The iterator is adjusted so that its current index (as returned by
371     * <tt>getOffset</tt>) is the match position if one was found.
372     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
373     * the iterator will be adjusted to a position after the end of the text
374     * string.
375     * @param  status for errors if it occurs
376     * @return The index of the next match after the current position,
377     *          or <tt>USEARCH_DONE</tt> if there are no more matches.
378     * @see #getOffset
379     * @stable ICU 2.0
380     */
381     int32_t next(UErrorCode &status);
382
383    /**
384     * Returns the index of the previous point at which the string text
385     * matches the search pattern, starting at the current position.
386     * The iterator is adjusted so that its current index (as returned by
387     * <tt>getOffset</tt>) is the match position if one was found.
388     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
389     * the iterator will be adjusted to the index USEARCH_DONE
390     * @param  status for errors if it occurs
391     * @return The index of the previous match before the current position,
392     *          or <tt>USEARCH_DONE</tt> if there are no more matches.
393     * @see #getOffset
394     * @stable ICU 2.0
395     */
396    int32_t previous(UErrorCode &status);
397
398    /**
399    * Resets the iteration.
400    * Search will begin at the start of the text string if a forward
401    * iteration is initiated before a backwards iteration. Otherwise if a
402    * backwards iteration is initiated before a forwards iteration, the
403    * search will begin at the end of the text string.
404    * @stable ICU 2.0
405    */
406    virtual void reset();
407
408protected:
409    // protected data members ---------------------------------------------
410
411    /**
412    * C search data struct
413    * @stable ICU 2.0
414    */
415    USearch *m_search_;
416
417    /**
418    * Break iterator.
419    * Currently the C++ breakiterator does not have getRules etc to reproduce
420    * another in C. Hence we keep the original around and do the verification
421    * at the end of the match. The user is responsible for deleting this
422    * break iterator.
423    * @stable ICU 2.0
424    */
425    BreakIterator *m_breakiterator_;
426
427    /**
428    * Unicode string version of the search text
429    * @stable ICU 2.0
430    */
431    UnicodeString  m_text_;
432
433    // protected constructors and destructors -----------------------------
434
435    /**
436    * Default constructor.
437    * Initializes data to the default values.
438    * @stable ICU 2.0
439    */
440    SearchIterator();
441
442    /**
443     * Constructor for use by subclasses.
444     * @param text The target text to be searched.
445     * @param breakiter A {@link BreakIterator} that is used to restrict the
446     *                points at which matches are detected. If
447     *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
448     *                match, but the match's start or end index is not a
449     *                boundary as determined by the <tt>BreakIterator</tt>,
450     *                the match is rejected and <tt>handleNext</tt> or
451     *                <tt>handlePrev</tt> is called again. If this parameter
452     *                is <tt>NULL</tt>, no break detection is attempted.
453     * @see #handleNext
454     * @see #handlePrev
455     * @stable ICU 2.0
456     */
457    SearchIterator(const UnicodeString &text,
458                         BreakIterator *breakiter = NULL);
459
460    /**
461     * Constructor for use by subclasses.
462     * <p>
463     * Note: No parsing of the text within the <tt>CharacterIterator</tt>
464     * will be done during searching for this version. The block of text
465     * in <tt>CharacterIterator</tt> will be used as it is.
466     * @param text The target text to be searched.
467     * @param breakiter A {@link BreakIterator} that is used to restrict the
468     *                points at which matches are detected. If
469     *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
470     *                match, but the match's start or end index is not a
471     *                boundary as determined by the <tt>BreakIterator</tt>,
472     *                the match is rejected and <tt>handleNext</tt> or
473     *                <tt>handlePrev</tt> is called again. If this parameter
474     *                is <tt>NULL</tt>, no break detection is attempted.
475     * @see #handleNext
476     * @see #handlePrev
477     * @stable ICU 2.0
478     */
479    SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
480
481    // protected methods --------------------------------------------------
482
483    /**
484     * Assignment operator. Sets this iterator to have the same behavior,
485     * and iterate over the same text, as the one passed in.
486     * @param that instance to be copied.
487     * @stable ICU 2.0
488     */
489    SearchIterator & operator=(const SearchIterator &that);
490
491    /**
492     * Abstract method which subclasses override to provide the mechanism
493     * for finding the next match in the target text. This allows different
494     * subclasses to provide different search algorithms.
495     * <p>
496     * If a match is found, the implementation should return the index at
497     * which the match starts and should call
498     * <tt>setMatchLength</tt> with the number of characters
499     * in the target text that make up the match. If no match is found, the
500     * method should return USEARCH_DONE.
501     * <p>
502     * @param position The index in the target text at which the search
503     *                 should start.
504     * @param status for error codes if it occurs.
505     * @return index at which the match starts, else if match is not found
506     *         USEARCH_DONE is returned
507     * @see #setMatchLength
508     * @stable ICU 2.0
509     */
510    virtual int32_t handleNext(int32_t position, UErrorCode &status)
511                                                                         = 0;
512
513    /**
514     * Abstract method which subclasses override to provide the mechanism for
515     * finding the previous match in the target text. This allows different
516     * subclasses to provide different search algorithms.
517     * <p>
518     * If a match is found, the implementation should return the index at
519     * which the match starts and should call
520     * <tt>setMatchLength</tt> with the number of characters
521     * in the target text that make up the match. If no match is found, the
522     * method should return USEARCH_DONE.
523     * <p>
524     * @param position The index in the target text at which the search
525     *                 should start.
526     * @param status for error codes if it occurs.
527     * @return index at which the match starts, else if match is not found
528     *         USEARCH_DONE is returned
529     * @see #setMatchLength
530     * @stable ICU 2.0
531     */
532     virtual int32_t handlePrev(int32_t position, UErrorCode &status)
533                                                                         = 0;
534
535    /**
536     * Sets the length of the currently matched string in the text string to
537     * be searched.
538     * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
539     * methods should call this when they find a match in the target text.
540     * @param length length of the matched text.
541     * @see #handleNext
542     * @see #handlePrev
543     * @stable ICU 2.0
544     */
545    virtual void setMatchLength(int32_t length);
546
547    /**
548     * Sets the offset of the currently matched string in the text string to
549     * be searched.
550     * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
551     * methods should call this when they find a match in the target text.
552     * @param position start offset of the matched text.
553     * @see #handleNext
554     * @see #handlePrev
555     * @stable ICU 2.0
556     */
557    virtual void setMatchStart(int32_t position);
558
559    /**
560    * sets match not found
561    * @stable ICU 2.0
562    */
563    void setMatchNotFound();
564};
565
566inline UBool SearchIterator::operator!=(const SearchIterator &that) const
567{
568   return !operator==(that);
569}
570U_NAMESPACE_END
571
572#endif /* #if !UCONFIG_NO_COLLATION */
573
574#endif
575
576