1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4 ****************************************************************************** 5 * Copyright (C) 1997-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ****************************************************************************** 8 */ 9 10/** 11 * \file 12 * \brief C++ API: Collation Element Iterator. 13 */ 14 15/** 16* File coleitr.h 17* 18* Created by: Helena Shih 19* 20* Modification History: 21* 22* Date Name Description 23* 24* 8/18/97 helena Added internal API documentation. 25* 08/03/98 erm Synched with 1.2 version CollationElementIterator.java 26* 12/10/99 aliu Ported Thai collation support from Java. 27* 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h) 28* 02/19/01 swquek Removed CollationElementsIterator() since it is 29* private constructor and no calls are made to it 30* 2012-2014 markus Rewritten in C++ again. 31*/ 32 33#ifndef COLEITR_H 34#define COLEITR_H 35 36#include "unicode/utypes.h" 37 38#if !UCONFIG_NO_COLLATION 39 40#include "unicode/unistr.h" 41#include "unicode/uobject.h" 42 43struct UCollationElements; 44struct UHashtable; 45 46U_NAMESPACE_BEGIN 47 48struct CollationData; 49 50class CollationIterator; 51class RuleBasedCollator; 52class UCollationPCE; 53class UVector32; 54 55/** 56* The CollationElementIterator class is used as an iterator to walk through 57* each character of an international string. Use the iterator to return the 58* ordering priority of the positioned character. The ordering priority of a 59* character, which we refer to as a key, defines how a character is collated in 60* the given collation object. 61* For example, consider the following in Slovak and in traditional Spanish collation: 62* <pre> 63* "ca" -> the first key is key('c') and second key is key('a'). 64* "cha" -> the first key is key('ch') and second key is key('a').</pre> 65* And in German phonebook collation, 66* <pre> \htmlonly "æb"-> the first key is key('a'), the second key is key('e'), and 67* the third key is key('b'). \endhtmlonly </pre> 68* The key of a character, is an integer composed of primary order(short), 69* secondary order(char), and tertiary order(char). Java strictly defines the 70* size and signedness of its primitive data types. Therefore, the static 71* functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return 72* int32_t to ensure the correctness of the key value. 73* <p>Example of the iterator usage: (without error checking) 74* <pre> 75* \code 76* void CollationElementIterator_Example() 77* { 78* UnicodeString str = "This is a test"; 79* UErrorCode success = U_ZERO_ERROR; 80* RuleBasedCollator* rbc = 81* (RuleBasedCollator*) RuleBasedCollator::createInstance(success); 82* CollationElementIterator* c = 83* rbc->createCollationElementIterator( str ); 84* int32_t order = c->next(success); 85* c->reset(); 86* order = c->previous(success); 87* delete c; 88* delete rbc; 89* } 90* \endcode 91* </pre> 92* <p> 93* The method next() returns the collation order of the next character based on 94* the comparison level of the collator. The method previous() returns the 95* collation order of the previous character based on the comparison level of 96* the collator. The Collation Element Iterator moves only in one direction 97* between calls to reset(), setOffset(), or setText(). That is, next() 98* and previous() can not be inter-used. Whenever previous() is to be called after 99* next() or vice versa, reset(), setOffset() or setText() has to be called first 100* to reset the status, shifting pointers to either the end or the start of 101* the string (reset() or setText()), or the specified position (setOffset()). 102* Hence at the next call of next() or previous(), the first or last collation order, 103* or collation order at the spefcifieid position will be returned. If a change of 104* direction is done without one of these calls, the result is undefined. 105* <p> 106* The result of a forward iterate (next()) and reversed result of the backward 107* iterate (previous()) on the same string are equivalent, if collation orders 108* with the value 0 are ignored. 109* Character based on the comparison level of the collator. A collation order 110* consists of primary order, secondary order and tertiary order. The data 111* type of the collation order is <strong>int32_t</strong>. 112* 113* Note, CollationElementIterator should not be subclassed. 114* @see Collator 115* @see RuleBasedCollator 116* @version 1.8 Jan 16 2001 117*/ 118class U_I18N_API CollationElementIterator U_FINAL : public UObject { 119public: 120 121 // CollationElementIterator public data member ------------------------------ 122 123 enum { 124 /** 125 * NULLORDER indicates that an error has occured while processing 126 * @stable ICU 2.0 127 */ 128 NULLORDER = (int32_t)0xffffffff 129 }; 130 131 // CollationElementIterator public constructor/destructor ------------------- 132 133 /** 134 * Copy constructor. 135 * 136 * @param other the object to be copied from 137 * @stable ICU 2.0 138 */ 139 CollationElementIterator(const CollationElementIterator& other); 140 141 /** 142 * Destructor 143 * @stable ICU 2.0 144 */ 145 virtual ~CollationElementIterator(); 146 147 // CollationElementIterator public methods ---------------------------------- 148 149 /** 150 * Returns true if "other" is the same as "this" 151 * 152 * @param other the object to be compared 153 * @return true if "other" is the same as "this" 154 * @stable ICU 2.0 155 */ 156 UBool operator==(const CollationElementIterator& other) const; 157 158 /** 159 * Returns true if "other" is not the same as "this". 160 * 161 * @param other the object to be compared 162 * @return true if "other" is not the same as "this" 163 * @stable ICU 2.0 164 */ 165 UBool operator!=(const CollationElementIterator& other) const; 166 167 /** 168 * Resets the cursor to the beginning of the string. 169 * @stable ICU 2.0 170 */ 171 void reset(void); 172 173 /** 174 * Gets the ordering priority of the next character in the string. 175 * @param status the error code status. 176 * @return the next character's ordering. otherwise returns NULLORDER if an 177 * error has occured or if the end of string has been reached 178 * @stable ICU 2.0 179 */ 180 int32_t next(UErrorCode& status); 181 182 /** 183 * Get the ordering priority of the previous collation element in the string. 184 * @param status the error code status. 185 * @return the previous element's ordering. otherwise returns NULLORDER if an 186 * error has occured or if the start of string has been reached 187 * @stable ICU 2.0 188 */ 189 int32_t previous(UErrorCode& status); 190 191 /** 192 * Gets the primary order of a collation order. 193 * @param order the collation order 194 * @return the primary order of a collation order. 195 * @stable ICU 2.0 196 */ 197 static inline int32_t primaryOrder(int32_t order); 198 199 /** 200 * Gets the secondary order of a collation order. 201 * @param order the collation order 202 * @return the secondary order of a collation order. 203 * @stable ICU 2.0 204 */ 205 static inline int32_t secondaryOrder(int32_t order); 206 207 /** 208 * Gets the tertiary order of a collation order. 209 * @param order the collation order 210 * @return the tertiary order of a collation order. 211 * @stable ICU 2.0 212 */ 213 static inline int32_t tertiaryOrder(int32_t order); 214 215 /** 216 * Return the maximum length of any expansion sequences that end with the 217 * specified comparison order. 218 * @param order a collation order returned by previous or next. 219 * @return maximum size of the expansion sequences ending with the collation 220 * element or 1 if collation element does not occur at the end of any 221 * expansion sequence 222 * @stable ICU 2.0 223 */ 224 int32_t getMaxExpansion(int32_t order) const; 225 226 /** 227 * Gets the comparison order in the desired strength. Ignore the other 228 * differences. 229 * @param order The order value 230 * @stable ICU 2.0 231 */ 232 int32_t strengthOrder(int32_t order) const; 233 234 /** 235 * Sets the source string. 236 * @param str the source string. 237 * @param status the error code status. 238 * @stable ICU 2.0 239 */ 240 void setText(const UnicodeString& str, UErrorCode& status); 241 242 /** 243 * Sets the source string. 244 * @param str the source character iterator. 245 * @param status the error code status. 246 * @stable ICU 2.0 247 */ 248 void setText(CharacterIterator& str, UErrorCode& status); 249 250 /** 251 * Checks if a comparison order is ignorable. 252 * @param order the collation order. 253 * @return TRUE if a character is ignorable, FALSE otherwise. 254 * @stable ICU 2.0 255 */ 256 static inline UBool isIgnorable(int32_t order); 257 258 /** 259 * Gets the offset of the currently processed character in the source string. 260 * @return the offset of the character. 261 * @stable ICU 2.0 262 */ 263 int32_t getOffset(void) const; 264 265 /** 266 * Sets the offset of the currently processed character in the source string. 267 * @param newOffset the new offset. 268 * @param status the error code status. 269 * @return the offset of the character. 270 * @stable ICU 2.0 271 */ 272 void setOffset(int32_t newOffset, UErrorCode& status); 273 274 /** 275 * ICU "poor man's RTTI", returns a UClassID for the actual class. 276 * 277 * @stable ICU 2.2 278 */ 279 virtual UClassID getDynamicClassID() const; 280 281 /** 282 * ICU "poor man's RTTI", returns a UClassID for this class. 283 * 284 * @stable ICU 2.2 285 */ 286 static UClassID U_EXPORT2 getStaticClassID(); 287 288#ifndef U_HIDE_INTERNAL_API 289 /** @internal */ 290 static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) { 291 return reinterpret_cast<CollationElementIterator *>(uc); 292 } 293 /** @internal */ 294 static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) { 295 return reinterpret_cast<const CollationElementIterator *>(uc); 296 } 297 /** @internal */ 298 inline UCollationElements *toUCollationElements() { 299 return reinterpret_cast<UCollationElements *>(this); 300 } 301 /** @internal */ 302 inline const UCollationElements *toUCollationElements() const { 303 return reinterpret_cast<const UCollationElements *>(this); 304 } 305#endif // U_HIDE_INTERNAL_API 306 307private: 308 friend class RuleBasedCollator; 309 friend class UCollationPCE; 310 311 /** 312 * CollationElementIterator constructor. This takes the source string and the 313 * collation object. The cursor will walk thru the source string based on the 314 * predefined collation rules. If the source string is empty, NULLORDER will 315 * be returned on the calls to next(). 316 * @param sourceText the source string. 317 * @param order the collation object. 318 * @param status the error code status. 319 */ 320 CollationElementIterator(const UnicodeString& sourceText, 321 const RuleBasedCollator* order, UErrorCode& status); 322 // Note: The constructors should take settings & tailoring, not a collator, 323 // to avoid circular dependencies. 324 // However, for operator==() we would need to be able to compare tailoring data for equality 325 // without making CollationData or CollationTailoring depend on TailoredSet. 326 // (See the implementation of RuleBasedCollator::operator==().) 327 // That might require creating an intermediate class that would be used 328 // by both CollationElementIterator and RuleBasedCollator 329 // but only contain the part of RBC== related to data and rules. 330 331 /** 332 * CollationElementIterator constructor. This takes the source string and the 333 * collation object. The cursor will walk thru the source string based on the 334 * predefined collation rules. If the source string is empty, NULLORDER will 335 * be returned on the calls to next(). 336 * @param sourceText the source string. 337 * @param order the collation object. 338 * @param status the error code status. 339 */ 340 CollationElementIterator(const CharacterIterator& sourceText, 341 const RuleBasedCollator* order, UErrorCode& status); 342 343 /** 344 * Assignment operator 345 * 346 * @param other the object to be copied 347 */ 348 const CollationElementIterator& 349 operator=(const CollationElementIterator& other); 350 351 CollationElementIterator(); // default constructor not implemented 352 353 /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */ 354 inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; } 355 356 static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode); 357 358 static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order); 359 360 // CollationElementIterator private data members ---------------------------- 361 362 CollationIterator *iter_; // owned 363 const RuleBasedCollator *rbc_; // aliased 364 uint32_t otherHalf_; 365 /** 366 * <0: backwards; 0: just after reset() (previous() begins from end); 367 * 1: just after setOffset(); >1: forward 368 */ 369 int8_t dir_; 370 /** 371 * Stores offsets from expansions and from unsafe-backwards iteration, 372 * so that getOffset() returns intermediate offsets for the CEs 373 * that are consistent with forward iteration. 374 */ 375 UVector32 *offsets_; 376 377 UnicodeString string_; 378}; 379 380// CollationElementIterator inline method definitions -------------------------- 381 382inline int32_t CollationElementIterator::primaryOrder(int32_t order) 383{ 384 return (order >> 16) & 0xffff; 385} 386 387inline int32_t CollationElementIterator::secondaryOrder(int32_t order) 388{ 389 return (order >> 8) & 0xff; 390} 391 392inline int32_t CollationElementIterator::tertiaryOrder(int32_t order) 393{ 394 return order & 0xff; 395} 396 397inline UBool CollationElementIterator::isIgnorable(int32_t order) 398{ 399 return (order & 0xffff0000) == 0; 400} 401 402U_NAMESPACE_END 403 404#endif /* #if !UCONFIG_NO_COLLATION */ 405 406#endif 407