1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4 ****************************************************************************** 5 * Copyright (C) 1997-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ****************************************************************************** 8 */ 9 10/** 11 * \file 12 * \brief C++ API: Collation Element Iterator. 13 */ 14 15/** 16* File coleitr.h 17* 18* Created by: Helena Shih 19* 20* Modification History: 21* 22* Date Name Description 23* 24* 8/18/97 helena Added internal API documentation. 25* 08/03/98 erm Synched with 1.2 version CollationElementIterator.java 26* 12/10/99 aliu Ported Thai collation support from Java. 27* 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h) 28* 02/19/01 swquek Removed CollationElementsIterator() since it is 29* private constructor and no calls are made to it 30* 2012-2014 markus Rewritten in C++ again. 31*/ 32 33#ifndef COLEITR_H 34#define COLEITR_H 35 36#include "unicode/utypes.h" 37 38#if !UCONFIG_NO_COLLATION 39 40#include "unicode/unistr.h" 41#include "unicode/uobject.h" 42 43struct UCollationElements; 44struct UHashtable; 45 46U_NAMESPACE_BEGIN 47 48struct CollationData; 49 50class CharacterIterator; 51class CollationIterator; 52class RuleBasedCollator; 53class UCollationPCE; 54class UVector32; 55 56/** 57* The CollationElementIterator class is used as an iterator to walk through 58* each character of an international string. Use the iterator to return the 59* ordering priority of the positioned character. The ordering priority of a 60* character, which we refer to as a key, defines how a character is collated in 61* the given collation object. 62* For example, consider the following in Slovak and in traditional Spanish collation: 63* <pre> 64* "ca" -> the first key is key('c') and second key is key('a'). 65* "cha" -> the first key is key('ch') and second key is key('a').</pre> 66* And in German phonebook collation, 67* <pre> \htmlonly "æb"-> the first key is key('a'), the second key is key('e'), and 68* the third key is key('b'). \endhtmlonly </pre> 69* The key of a character, is an integer composed of primary order(short), 70* secondary order(char), and tertiary order(char). Java strictly defines the 71* size and signedness of its primitive data types. Therefore, the static 72* functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return 73* int32_t to ensure the correctness of the key value. 74* <p>Example of the iterator usage: (without error checking) 75* <pre> 76* \code 77* void CollationElementIterator_Example() 78* { 79* UnicodeString str = "This is a test"; 80* UErrorCode success = U_ZERO_ERROR; 81* RuleBasedCollator* rbc = 82* (RuleBasedCollator*) RuleBasedCollator::createInstance(success); 83* CollationElementIterator* c = 84* rbc->createCollationElementIterator( str ); 85* int32_t order = c->next(success); 86* c->reset(); 87* order = c->previous(success); 88* delete c; 89* delete rbc; 90* } 91* \endcode 92* </pre> 93* <p> 94* The method next() returns the collation order of the next character based on 95* the comparison level of the collator. The method previous() returns the 96* collation order of the previous character based on the comparison level of 97* the collator. The Collation Element Iterator moves only in one direction 98* between calls to reset(), setOffset(), or setText(). That is, next() 99* and previous() can not be inter-used. Whenever previous() is to be called after 100* next() or vice versa, reset(), setOffset() or setText() has to be called first 101* to reset the status, shifting pointers to either the end or the start of 102* the string (reset() or setText()), or the specified position (setOffset()). 103* Hence at the next call of next() or previous(), the first or last collation order, 104* or collation order at the spefcifieid position will be returned. If a change of 105* direction is done without one of these calls, the result is undefined. 106* <p> 107* The result of a forward iterate (next()) and reversed result of the backward 108* iterate (previous()) on the same string are equivalent, if collation orders 109* with the value 0 are ignored. 110* Character based on the comparison level of the collator. A collation order 111* consists of primary order, secondary order and tertiary order. The data 112* type of the collation order is <strong>int32_t</strong>. 113* 114* Note, CollationElementIterator should not be subclassed. 115* @see Collator 116* @see RuleBasedCollator 117* @version 1.8 Jan 16 2001 118*/ 119class U_I18N_API CollationElementIterator U_FINAL : public UObject { 120public: 121 122 // CollationElementIterator public data member ------------------------------ 123 124 enum { 125 /** 126 * NULLORDER indicates that an error has occured while processing 127 * @stable ICU 2.0 128 */ 129 NULLORDER = (int32_t)0xffffffff 130 }; 131 132 // CollationElementIterator public constructor/destructor ------------------- 133 134 /** 135 * Copy constructor. 136 * 137 * @param other the object to be copied from 138 * @stable ICU 2.0 139 */ 140 CollationElementIterator(const CollationElementIterator& other); 141 142 /** 143 * Destructor 144 * @stable ICU 2.0 145 */ 146 virtual ~CollationElementIterator(); 147 148 // CollationElementIterator public methods ---------------------------------- 149 150 /** 151 * Returns true if "other" is the same as "this" 152 * 153 * @param other the object to be compared 154 * @return true if "other" is the same as "this" 155 * @stable ICU 2.0 156 */ 157 UBool operator==(const CollationElementIterator& other) const; 158 159 /** 160 * Returns true if "other" is not the same as "this". 161 * 162 * @param other the object to be compared 163 * @return true if "other" is not the same as "this" 164 * @stable ICU 2.0 165 */ 166 UBool operator!=(const CollationElementIterator& other) const; 167 168 /** 169 * Resets the cursor to the beginning of the string. 170 * @stable ICU 2.0 171 */ 172 void reset(void); 173 174 /** 175 * Gets the ordering priority of the next character in the string. 176 * @param status the error code status. 177 * @return the next character's ordering. otherwise returns NULLORDER if an 178 * error has occured or if the end of string has been reached 179 * @stable ICU 2.0 180 */ 181 int32_t next(UErrorCode& status); 182 183 /** 184 * Get the ordering priority of the previous collation element in the string. 185 * @param status the error code status. 186 * @return the previous element's ordering. otherwise returns NULLORDER if an 187 * error has occured or if the start of string has been reached 188 * @stable ICU 2.0 189 */ 190 int32_t previous(UErrorCode& status); 191 192 /** 193 * Gets the primary order of a collation order. 194 * @param order the collation order 195 * @return the primary order of a collation order. 196 * @stable ICU 2.0 197 */ 198 static inline int32_t primaryOrder(int32_t order); 199 200 /** 201 * Gets the secondary order of a collation order. 202 * @param order the collation order 203 * @return the secondary order of a collation order. 204 * @stable ICU 2.0 205 */ 206 static inline int32_t secondaryOrder(int32_t order); 207 208 /** 209 * Gets the tertiary order of a collation order. 210 * @param order the collation order 211 * @return the tertiary order of a collation order. 212 * @stable ICU 2.0 213 */ 214 static inline int32_t tertiaryOrder(int32_t order); 215 216 /** 217 * Return the maximum length of any expansion sequences that end with the 218 * specified comparison order. 219 * @param order a collation order returned by previous or next. 220 * @return maximum size of the expansion sequences ending with the collation 221 * element or 1 if collation element does not occur at the end of any 222 * expansion sequence 223 * @stable ICU 2.0 224 */ 225 int32_t getMaxExpansion(int32_t order) const; 226 227 /** 228 * Gets the comparison order in the desired strength. Ignore the other 229 * differences. 230 * @param order The order value 231 * @stable ICU 2.0 232 */ 233 int32_t strengthOrder(int32_t order) const; 234 235 /** 236 * Sets the source string. 237 * @param str the source string. 238 * @param status the error code status. 239 * @stable ICU 2.0 240 */ 241 void setText(const UnicodeString& str, UErrorCode& status); 242 243 /** 244 * Sets the source string. 245 * @param str the source character iterator. 246 * @param status the error code status. 247 * @stable ICU 2.0 248 */ 249 void setText(CharacterIterator& str, UErrorCode& status); 250 251 /** 252 * Checks if a comparison order is ignorable. 253 * @param order the collation order. 254 * @return TRUE if a character is ignorable, FALSE otherwise. 255 * @stable ICU 2.0 256 */ 257 static inline UBool isIgnorable(int32_t order); 258 259 /** 260 * Gets the offset of the currently processed character in the source string. 261 * @return the offset of the character. 262 * @stable ICU 2.0 263 */ 264 int32_t getOffset(void) const; 265 266 /** 267 * Sets the offset of the currently processed character in the source string. 268 * @param newOffset the new offset. 269 * @param status the error code status. 270 * @return the offset of the character. 271 * @stable ICU 2.0 272 */ 273 void setOffset(int32_t newOffset, UErrorCode& status); 274 275 /** 276 * ICU "poor man's RTTI", returns a UClassID for the actual class. 277 * 278 * @stable ICU 2.2 279 */ 280 virtual UClassID getDynamicClassID() const; 281 282 /** 283 * ICU "poor man's RTTI", returns a UClassID for this class. 284 * 285 * @stable ICU 2.2 286 */ 287 static UClassID U_EXPORT2 getStaticClassID(); 288 289#ifndef U_HIDE_INTERNAL_API 290 /** @internal */ 291 static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) { 292 return reinterpret_cast<CollationElementIterator *>(uc); 293 } 294 /** @internal */ 295 static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) { 296 return reinterpret_cast<const CollationElementIterator *>(uc); 297 } 298 /** @internal */ 299 inline UCollationElements *toUCollationElements() { 300 return reinterpret_cast<UCollationElements *>(this); 301 } 302 /** @internal */ 303 inline const UCollationElements *toUCollationElements() const { 304 return reinterpret_cast<const UCollationElements *>(this); 305 } 306#endif // U_HIDE_INTERNAL_API 307 308private: 309 friend class RuleBasedCollator; 310 friend class UCollationPCE; 311 312 /** 313 * CollationElementIterator constructor. This takes the source string and the 314 * collation object. The cursor will walk thru the source string based on the 315 * predefined collation rules. If the source string is empty, NULLORDER will 316 * be returned on the calls to next(). 317 * @param sourceText the source string. 318 * @param order the collation object. 319 * @param status the error code status. 320 */ 321 CollationElementIterator(const UnicodeString& sourceText, 322 const RuleBasedCollator* order, UErrorCode& status); 323 // Note: The constructors should take settings & tailoring, not a collator, 324 // to avoid circular dependencies. 325 // However, for operator==() we would need to be able to compare tailoring data for equality 326 // without making CollationData or CollationTailoring depend on TailoredSet. 327 // (See the implementation of RuleBasedCollator::operator==().) 328 // That might require creating an intermediate class that would be used 329 // by both CollationElementIterator and RuleBasedCollator 330 // but only contain the part of RBC== related to data and rules. 331 332 /** 333 * CollationElementIterator constructor. This takes the source string and the 334 * collation object. The cursor will walk thru the source string based on the 335 * predefined collation rules. If the source string is empty, NULLORDER will 336 * be returned on the calls to next(). 337 * @param sourceText the source string. 338 * @param order the collation object. 339 * @param status the error code status. 340 */ 341 CollationElementIterator(const CharacterIterator& sourceText, 342 const RuleBasedCollator* order, UErrorCode& status); 343 344 /** 345 * Assignment operator 346 * 347 * @param other the object to be copied 348 */ 349 const CollationElementIterator& 350 operator=(const CollationElementIterator& other); 351 352 CollationElementIterator(); // default constructor not implemented 353 354 /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */ 355 inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; } 356 357 static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode); 358 359 static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order); 360 361 // CollationElementIterator private data members ---------------------------- 362 363 CollationIterator *iter_; // owned 364 const RuleBasedCollator *rbc_; // aliased 365 uint32_t otherHalf_; 366 /** 367 * <0: backwards; 0: just after reset() (previous() begins from end); 368 * 1: just after setOffset(); >1: forward 369 */ 370 int8_t dir_; 371 /** 372 * Stores offsets from expansions and from unsafe-backwards iteration, 373 * so that getOffset() returns intermediate offsets for the CEs 374 * that are consistent with forward iteration. 375 */ 376 UVector32 *offsets_; 377 378 UnicodeString string_; 379}; 380 381// CollationElementIterator inline method definitions -------------------------- 382 383inline int32_t CollationElementIterator::primaryOrder(int32_t order) 384{ 385 return (order >> 16) & 0xffff; 386} 387 388inline int32_t CollationElementIterator::secondaryOrder(int32_t order) 389{ 390 return (order >> 8) & 0xff; 391} 392 393inline int32_t CollationElementIterator::tertiaryOrder(int32_t order) 394{ 395 return order & 0xff; 396} 397 398inline UBool CollationElementIterator::isIgnorable(int32_t order) 399{ 400 return (order & 0xffff0000) == 0; 401} 402 403U_NAMESPACE_END 404 405#endif /* #if !UCONFIG_NO_COLLATION */ 406 407#endif 408