1/* 2********************************************************************** 3* Copyright (C) 2001-2011 IBM and others. All rights reserved. 4********************************************************************** 5* Date Name Description 6* 03/22/2000 helena Creation. 7********************************************************************** 8*/ 9 10#ifndef SEARCH_H 11#define SEARCH_H 12 13#include "unicode/utypes.h" 14 15/** 16 * \file 17 * \brief C++ API: SearchIterator object. 18 */ 19 20#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 21 22#include "unicode/uobject.h" 23#include "unicode/unistr.h" 24#include "unicode/chariter.h" 25#include "unicode/brkiter.h" 26#include "unicode/usearch.h" 27 28/** 29* @stable ICU 2.0 30*/ 31struct USearch; 32/** 33* @stable ICU 2.0 34*/ 35typedef struct USearch USearch; 36 37U_NAMESPACE_BEGIN 38 39/** 40 * 41 * <tt>SearchIterator</tt> is an abstract base class that provides 42 * methods to search for a pattern within a text string. Instances of 43 * <tt>SearchIterator</tt> maintain a current position and scans over the 44 * target text, returning the indices the pattern is matched and the length 45 * of each match. 46 * <p> 47 * <tt>SearchIterator</tt> defines a protocol for text searching. 48 * Subclasses provide concrete implementations of various search algorithms. 49 * For example, <tt>StringSearch</tt> implements language-sensitive pattern 50 * matching based on the comparison rules defined in a 51 * <tt>RuleBasedCollator</tt> object. 52 * <p> 53 * Other options for searching includes using a BreakIterator to restrict 54 * the points at which matches are detected. 55 * <p> 56 * <tt>SearchIterator</tt> provides an API that is similar to that of 57 * other text iteration classes such as <tt>BreakIterator</tt>. Using 58 * this class, it is easy to scan through text looking for all occurances of 59 * a given pattern. The following example uses a <tt>StringSearch</tt> 60 * object to find all instances of "fox" in the target string. Any other 61 * subclass of <tt>SearchIterator</tt> can be used in an identical 62 * manner. 63 * <pre><code> 64 * UnicodeString target("The quick brown fox jumped over the lazy fox"); 65 * UnicodeString pattern("fox"); 66 * 67 * SearchIterator *iter = new StringSearch(pattern, target); 68 * UErrorCode error = U_ZERO_ERROR; 69 * for (int pos = iter->first(error); pos != USEARCH_DONE; 70 * pos = iter->next(error)) { 71 * printf("Found match at %d pos, length is %d\n", pos, 72 * iter.getMatchLength()); 73 * } 74 * </code></pre> 75 * 76 * @see StringSearch 77 * @see RuleBasedCollator 78 */ 79class U_I18N_API SearchIterator : public UObject { 80 81public: 82 83 // public constructors and destructors ------------------------------- 84 85 /** 86 * Copy constructor that creates a SearchIterator instance with the same 87 * behavior, and iterating over the same text. 88 * @param other the SearchIterator instance to be copied. 89 * @stable ICU 2.0 90 */ 91 SearchIterator(const SearchIterator &other); 92 93 /** 94 * Destructor. Cleans up the search iterator data struct. 95 * @stable ICU 2.0 96 */ 97 virtual ~SearchIterator(); 98 99 // public get and set methods ---------------------------------------- 100 101 /** 102 * Sets the index to point to the given position, and clears any state 103 * that's affected. 104 * <p> 105 * This method takes the argument index and sets the position in the text 106 * string accordingly without checking if the index is pointing to a 107 * valid starting point to begin searching. 108 * @param position within the text to be set. If position is less 109 * than or greater than the text range for searching, 110 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 111 * @param status for errors if it occurs 112 * @stable ICU 2.0 113 */ 114 virtual void setOffset(int32_t position, UErrorCode &status) = 0; 115 116 /** 117 * Return the current index in the text being searched. 118 * If the iteration has gone past the end of the text 119 * (or past the beginning for a backwards search), USEARCH_DONE 120 * is returned. 121 * @return current index in the text being searched. 122 * @stable ICU 2.0 123 */ 124 virtual int32_t getOffset(void) const = 0; 125 126 /** 127 * Sets the text searching attributes located in the enum 128 * USearchAttribute with values from the enum USearchAttributeValue. 129 * USEARCH_DEFAULT can be used for all attributes for resetting. 130 * @param attribute text attribute (enum USearchAttribute) to be set 131 * @param value text attribute value 132 * @param status for errors if it occurs 133 * @stable ICU 2.0 134 */ 135 void setAttribute(USearchAttribute attribute, 136 USearchAttributeValue value, 137 UErrorCode &status); 138 139 /** 140 * Gets the text searching attributes 141 * @param attribute text attribute (enum USearchAttribute) to be retrieve 142 * @return text attribute value 143 * @stable ICU 2.0 144 */ 145 USearchAttributeValue getAttribute(USearchAttribute attribute) const; 146 147 /** 148 * Returns the index to the match in the text string that was searched. 149 * This call returns a valid result only after a successful call to 150 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 151 * Just after construction, or after a searching method returns 152 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. 153 * <p> 154 * Use getMatchedLength to get the matched string length. 155 * @return index of a substring within the text string that is being 156 * searched. 157 * @see #first 158 * @see #next 159 * @see #previous 160 * @see #last 161 * @stable ICU 2.0 162 */ 163 int32_t getMatchedStart(void) const; 164 165 /** 166 * Returns the length of text in the string which matches the search 167 * pattern. This call returns a valid result only after a successful call 168 * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 169 * Just after construction, or after a searching method returns 170 * <tt>USEARCH_DONE</tt>, this method will return 0. 171 * @return The length of the match in the target text, or 0 if there 172 * is no match currently. 173 * @see #first 174 * @see #next 175 * @see #previous 176 * @see #last 177 * @stable ICU 2.0 178 */ 179 int32_t getMatchedLength(void) const; 180 181 /** 182 * Returns the text that was matched by the most recent call to 183 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 184 * If the iterator is not pointing at a valid match (e.g. just after 185 * construction or after <tt>USEARCH_DONE</tt> has been returned, 186 * returns an empty string. 187 * @param result stores the matched string or an empty string if a match 188 * is not found. 189 * @see #first 190 * @see #next 191 * @see #previous 192 * @see #last 193 * @stable ICU 2.0 194 */ 195 void getMatchedText(UnicodeString &result) const; 196 197 /** 198 * Set the BreakIterator that will be used to restrict the points 199 * at which matches are detected. The user is responsible for deleting 200 * the breakiterator. 201 * @param breakiter A BreakIterator that will be used to restrict the 202 * points at which matches are detected. If a match is 203 * found, but the match's start or end index is not a 204 * boundary as determined by the <tt>BreakIterator</tt>, 205 * the match will be rejected and another will be searched 206 * for. If this parameter is <tt>NULL</tt>, no break 207 * detection is attempted. 208 * @param status for errors if it occurs 209 * @see BreakIterator 210 * @stable ICU 2.0 211 */ 212 void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); 213 214 /** 215 * Returns the BreakIterator that is used to restrict the points at 216 * which matches are detected. This will be the same object that was 217 * passed to the constructor or to <tt>setBreakIterator</tt>. 218 * Note that <tt>NULL</tt> is a legal value; it means that break 219 * detection should not be attempted. 220 * @return BreakIterator used to restrict matchings. 221 * @see #setBreakIterator 222 * @stable ICU 2.0 223 */ 224 const BreakIterator * getBreakIterator(void) const; 225 226 /** 227 * Set the string text to be searched. Text iteration will hence begin at 228 * the start of the text string. This method is useful if you want to 229 * re-use an iterator to search for the same pattern within a different 230 * body of text. The user is responsible for deleting the text. 231 * @param text string to be searched. 232 * @param status for errors. If the text length is 0, 233 * an U_ILLEGAL_ARGUMENT_ERROR is returned. 234 * @stable ICU 2.0 235 */ 236 virtual void setText(const UnicodeString &text, UErrorCode &status); 237 238 /** 239 * Set the string text to be searched. Text iteration will hence begin at 240 * the start of the text string. This method is useful if you want to 241 * re-use an iterator to search for the same pattern within a different 242 * body of text. 243 * <p> 244 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 245 * will be done during searching for this version. The block of text 246 * in <tt>CharacterIterator</tt> will be used as it is. 247 * The user is responsible for deleting the text. 248 * @param text string iterator to be searched. 249 * @param status for errors if any. If the text length is 0 then an 250 * U_ILLEGAL_ARGUMENT_ERROR is returned. 251 * @stable ICU 2.0 252 */ 253 virtual void setText(CharacterIterator &text, UErrorCode &status); 254 255 /** 256 * Return the string text to be searched. 257 * @return text string to be searched. 258 * @stable ICU 2.0 259 */ 260 const UnicodeString & getText(void) const; 261 262 // operator overloading ---------------------------------------------- 263 264 /** 265 * Equality operator. 266 * @param that SearchIterator instance to be compared. 267 * @return TRUE if both BreakIterators are of the same class, have the 268 * same behavior, terates over the same text and have the same 269 * attributes. FALSE otherwise. 270 * @stable ICU 2.0 271 */ 272 virtual UBool operator==(const SearchIterator &that) const; 273 274 /** 275 * Not-equal operator. 276 * @param that SearchIterator instance to be compared. 277 * @return FALSE if operator== returns TRUE, and vice versa. 278 * @stable ICU 2.0 279 */ 280 UBool operator!=(const SearchIterator &that) const; 281 282 // public methods ---------------------------------------------------- 283 284 /** 285 * Returns a copy of SearchIterator with the same behavior, and 286 * iterating over the same text, as this one. Note that all data will be 287 * replicated, except for the text string to be searched. 288 * @return cloned object 289 * @stable ICU 2.0 290 */ 291 virtual SearchIterator* safeClone(void) const = 0; 292 293 /** 294 * Returns the first index at which the string text matches the search 295 * pattern. The iterator is adjusted so that its current index (as 296 * returned by <tt>getOffset</tt>) is the match position if one 297 * was found. 298 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 299 * the iterator will be adjusted to the index USEARCH_DONE 300 * @param status for errors if it occurs 301 * @return The character index of the first match, or 302 * <tt>USEARCH_DONE</tt> if there are no matches. 303 * @see #getOffset 304 * @stable ICU 2.0 305 */ 306 int32_t first(UErrorCode &status); 307 308 /** 309 * Returns the first index equal or greater than <tt>position</tt> at which the 310 * string text matches the search pattern. The iterator is adjusted so 311 * that its current index (as returned by <tt>getOffset</tt>) is the 312 * match position if one was found. 313 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the 314 * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 315 * @param position where search if to start from. If position is less 316 * than or greater than the text range for searching, 317 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 318 * @param status for errors if it occurs 319 * @return The character index of the first match following 320 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no 321 * matches. 322 * @see #getOffset 323 * @stable ICU 2.0 324 */ 325 int32_t following(int32_t position, UErrorCode &status); 326 327 /** 328 * Returns the last index in the target text at which it matches the 329 * search pattern. The iterator is adjusted so that its current index 330 * (as returned by <tt>getOffset</tt>) is the match position if one was 331 * found. 332 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 333 * the iterator will be adjusted to the index USEARCH_DONE. 334 * @param status for errors if it occurs 335 * @return The index of the first match, or <tt>USEARCH_DONE</tt> if 336 * there are no matches. 337 * @see #getOffset 338 * @stable ICU 2.0 339 */ 340 int32_t last(UErrorCode &status); 341 342 /** 343 * Returns the first index less than <tt>position</tt> at which the string 344 * text matches the search pattern. The iterator is adjusted so that its 345 * current index (as returned by <tt>getOffset</tt>) is the match 346 * position if one was found. If a match is not found, 347 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be 348 * adjusted to the index USEARCH_DONE 349 * <p> 350 * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the 351 * result match is always less than <tt>position</tt>. 352 * When <tt>USERARCH_OVERLAP</tt> is on, the result match may span across 353 * <tt>position</tt>. 354 * 355 * @param position where search is to start from. If position is less 356 * than or greater than the text range for searching, 357 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 358 * @param status for errors if it occurs 359 * @return The character index of the first match preceding 360 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are 361 * no matches. 362 * @see #getOffset 363 * @stable ICU 2.0 364 */ 365 int32_t preceding(int32_t position, UErrorCode &status); 366 367 /** 368 * Returns the index of the next point at which the text matches the 369 * search pattern, starting from the current position 370 * The iterator is adjusted so that its current index (as returned by 371 * <tt>getOffset</tt>) is the match position if one was found. 372 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 373 * the iterator will be adjusted to a position after the end of the text 374 * string. 375 * @param status for errors if it occurs 376 * @return The index of the next match after the current position, 377 * or <tt>USEARCH_DONE</tt> if there are no more matches. 378 * @see #getOffset 379 * @stable ICU 2.0 380 */ 381 int32_t next(UErrorCode &status); 382 383 /** 384 * Returns the index of the previous point at which the string text 385 * matches the search pattern, starting at the current position. 386 * The iterator is adjusted so that its current index (as returned by 387 * <tt>getOffset</tt>) is the match position if one was found. 388 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 389 * the iterator will be adjusted to the index USEARCH_DONE 390 * @param status for errors if it occurs 391 * @return The index of the previous match before the current position, 392 * or <tt>USEARCH_DONE</tt> if there are no more matches. 393 * @see #getOffset 394 * @stable ICU 2.0 395 */ 396 int32_t previous(UErrorCode &status); 397 398 /** 399 * Resets the iteration. 400 * Search will begin at the start of the text string if a forward 401 * iteration is initiated before a backwards iteration. Otherwise if a 402 * backwards iteration is initiated before a forwards iteration, the 403 * search will begin at the end of the text string. 404 * @stable ICU 2.0 405 */ 406 virtual void reset(); 407 408protected: 409 // protected data members --------------------------------------------- 410 411 /** 412 * C search data struct 413 * @stable ICU 2.0 414 */ 415 USearch *m_search_; 416 417 /** 418 * Break iterator. 419 * Currently the C++ breakiterator does not have getRules etc to reproduce 420 * another in C. Hence we keep the original around and do the verification 421 * at the end of the match. The user is responsible for deleting this 422 * break iterator. 423 * @stable ICU 2.0 424 */ 425 BreakIterator *m_breakiterator_; 426 427 /** 428 * Unicode string version of the search text 429 * @stable ICU 2.0 430 */ 431 UnicodeString m_text_; 432 433 // protected constructors and destructors ----------------------------- 434 435 /** 436 * Default constructor. 437 * Initializes data to the default values. 438 * @stable ICU 2.0 439 */ 440 SearchIterator(); 441 442 /** 443 * Constructor for use by subclasses. 444 * @param text The target text to be searched. 445 * @param breakiter A {@link BreakIterator} that is used to restrict the 446 * points at which matches are detected. If 447 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 448 * match, but the match's start or end index is not a 449 * boundary as determined by the <tt>BreakIterator</tt>, 450 * the match is rejected and <tt>handleNext</tt> or 451 * <tt>handlePrev</tt> is called again. If this parameter 452 * is <tt>NULL</tt>, no break detection is attempted. 453 * @see #handleNext 454 * @see #handlePrev 455 * @stable ICU 2.0 456 */ 457 SearchIterator(const UnicodeString &text, 458 BreakIterator *breakiter = NULL); 459 460 /** 461 * Constructor for use by subclasses. 462 * <p> 463 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 464 * will be done during searching for this version. The block of text 465 * in <tt>CharacterIterator</tt> will be used as it is. 466 * @param text The target text to be searched. 467 * @param breakiter A {@link BreakIterator} that is used to restrict the 468 * points at which matches are detected. If 469 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 470 * match, but the match's start or end index is not a 471 * boundary as determined by the <tt>BreakIterator</tt>, 472 * the match is rejected and <tt>handleNext</tt> or 473 * <tt>handlePrev</tt> is called again. If this parameter 474 * is <tt>NULL</tt>, no break detection is attempted. 475 * @see #handleNext 476 * @see #handlePrev 477 * @stable ICU 2.0 478 */ 479 SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL); 480 481 // protected methods -------------------------------------------------- 482 483 /** 484 * Assignment operator. Sets this iterator to have the same behavior, 485 * and iterate over the same text, as the one passed in. 486 * @param that instance to be copied. 487 * @stable ICU 2.0 488 */ 489 SearchIterator & operator=(const SearchIterator &that); 490 491 /** 492 * Abstract method which subclasses override to provide the mechanism 493 * for finding the next match in the target text. This allows different 494 * subclasses to provide different search algorithms. 495 * <p> 496 * If a match is found, the implementation should return the index at 497 * which the match starts and should call 498 * <tt>setMatchLength</tt> with the number of characters 499 * in the target text that make up the match. If no match is found, the 500 * method should return USEARCH_DONE. 501 * <p> 502 * @param position The index in the target text at which the search 503 * should start. 504 * @param status for error codes if it occurs. 505 * @return index at which the match starts, else if match is not found 506 * USEARCH_DONE is returned 507 * @see #setMatchLength 508 * @stable ICU 2.0 509 */ 510 virtual int32_t handleNext(int32_t position, UErrorCode &status) 511 = 0; 512 513 /** 514 * Abstract method which subclasses override to provide the mechanism for 515 * finding the previous match in the target text. This allows different 516 * subclasses to provide different search algorithms. 517 * <p> 518 * If a match is found, the implementation should return the index at 519 * which the match starts and should call 520 * <tt>setMatchLength</tt> with the number of characters 521 * in the target text that make up the match. If no match is found, the 522 * method should return USEARCH_DONE. 523 * <p> 524 * @param position The index in the target text at which the search 525 * should start. 526 * @param status for error codes if it occurs. 527 * @return index at which the match starts, else if match is not found 528 * USEARCH_DONE is returned 529 * @see #setMatchLength 530 * @stable ICU 2.0 531 */ 532 virtual int32_t handlePrev(int32_t position, UErrorCode &status) 533 = 0; 534 535 /** 536 * Sets the length of the currently matched string in the text string to 537 * be searched. 538 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> 539 * methods should call this when they find a match in the target text. 540 * @param length length of the matched text. 541 * @see #handleNext 542 * @see #handlePrev 543 * @stable ICU 2.0 544 */ 545 virtual void setMatchLength(int32_t length); 546 547 /** 548 * Sets the offset of the currently matched string in the text string to 549 * be searched. 550 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> 551 * methods should call this when they find a match in the target text. 552 * @param position start offset of the matched text. 553 * @see #handleNext 554 * @see #handlePrev 555 * @stable ICU 2.0 556 */ 557 virtual void setMatchStart(int32_t position); 558 559 /** 560 * sets match not found 561 * @stable ICU 2.0 562 */ 563 void setMatchNotFound(); 564}; 565 566inline UBool SearchIterator::operator!=(const SearchIterator &that) const 567{ 568 return !operator==(that); 569} 570U_NAMESPACE_END 571 572#endif /* #if !UCONFIG_NO_COLLATION */ 573 574#endif 575 576